Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

""" 

Handling Metadata for Corpora 

============================= 

 

This module contains functions for metadata handling 

provided by `DARIAH-DE`_. 

 

.. _DARIAH-DE: 

https://de.dariah.eu 

https://github.com/DARIAH-DE 

""" 

 

__author__ = "DARIAH-DE" 

__authors__ = "Thorsten Vitt" 

__email__ = "thorsten.vitt@uni-wuerzburg.de" 

 

import abc 

import glob 

import os 

import pandas as pd 

import regex 

 

class AbstractCorpus:
    """
    Base class for corpus metadata handling.

    Subclasses provide `list_docs`; this class derives file names from the
    metadata, applies functions per file and keeps track of segment counts.

    Note: `list_docs` is decorated with `abc.abstractmethod`, but since this
    class does not use `abc.ABCMeta` the decorator is advisory only and
    instantiating the class is not prevented.

    Attributes are set in `__init__`:
        basedir: directory that generated file names are joined to.
        default_pattern: format string used when no pattern is passed.
        segments: None until `flatten_segments` has been started, afterwards
            a list with the number of segments per document.
    """

    def __init__(self, basedir='.', default_pattern='{Index}.txt'):
        """
        Args:
            basedir (str): Default base directory for `filenames`.
            default_pattern (str): Default format string for `filenames`; it is
                filled with `str.format_map` from each document's mapping.
        """
        self.basedir = basedir
        self.default_pattern = default_pattern
        self.segments = None

    @abc.abstractmethod
    def list_docs(self):
        """
        Returns an iterable over Mappings that describe a corpus.

        It is expected that each Mapping has a key 'Index' with
        unique values, and that each call lists the documents in
        the same order.
        """

    def list_segments(self):
        """
        Returns an iterable over Mappings that describe a segmented corpus.

        There is a dict for each segment with the metadata, and an int-valued
        item `segment` identifies the segment within its document.

        Raises:
            ValueError: if no segment counts have been recorded yet. As this
                is a generator, the error surfaces on first iteration.
        """
        if self.segments is None:
            raise ValueError("Not segmentized yet")
        # Pair every document with its own segment count. Iterating over the
        # bare tuple `(docs, segments)` would visit the two containers
        # themselves instead of their elements.
        for doc, segcount in zip(self.list_docs(), self.segments):
            for seg in range(segcount):
                yield dict(doc, segment=seg)

    def filenames(self, basedir=None, pattern=None, segments=False):
        """
        Yields a filename for each entry in the metadata.

        Args:
            basedir (str): Directory to prepend; defaults to `self.basedir`.
            pattern (str): Format string filled from each entry's mapping;
                defaults to `self.default_pattern`.
            segments (bool): If True, iterate over `list_segments()` instead
                of `list_docs()`, so `{segment}` is available in the pattern.
        """
        if basedir is None:
            basedir = self.basedir
        if pattern is None:
            pattern = self.default_pattern
        items = self.list_segments() if segments else self.list_docs()
        for doc in items:
            yield os.path.join(basedir, pattern.format_map(doc))

    def forall(self, function, *args, basedir=None, pattern=None, segments=False, **kwargs):
        """
        Calls the given function for each filename.

        If additional positional arguments are given, they must each be an iterable
        of the same length that corresponds to the list of filenames.

        So this is essentially equivalent to:

            map(partial(function, **kwargs),
                self.filenames(basedir, pattern, segments), *args)

        The results are yielded lazily, i.e. nothing is called before the
        returned generator is iterated.

        Todo:

            This could get two boolean arguments to enable convenience behaviour:

            tee: If True, call the function with a copy of the iterable. Can be
                used as an intermediate pipeline step that dumps something.
            consume: If True, consume function's result and return nothing.

            Both options would force the pipeline to run immediately.
        """
        names = self.filenames(basedir, pattern, segments)
        # A distinct loop name keeps the `*args` parameter from being rebound.
        for call_args in zip(names, *args):
            yield function(*call_args, **kwargs)

    def flatten_segments(self, documents):
        """
        Flatten a segment structure and record segment counts.

        `documents` is expected to be an Iterable of documents that matches this
        corpus' metadata. Each document is expected to be an Iterable of segments.
        Each segment is expected to be an Iterable of features (like tokens).

        This method yields each segment, thus it flattens the structure by one
        level. Additionally, it keeps track of segments by counting the number of
        segments in each document in this object's `segments` field. Afterwards,
        you can pass `segments=True` to `filenames` and `forall`.

        The counts are collected while the generator is being consumed, so
        `segments` is only complete once it has been exhausted.
        """
        self.segments = []
        for document in documents:
            # Open a counter for this document; empty documents keep 0.
            self.segments.append(0)
            for segment in document:
                self.segments[-1] += 1
                yield segment

 

class TableCorpus(AbstractCorpus):
    """Corpus whose document metadata is kept in a pandas DataFrame."""

    def __init__(self, data, **kwargs):
        """
        Set up the corpus from tabular metadata.

        Args:
            data: Passed to the `pd.DataFrame` constructor; the resulting
                frame is stored as `self.metadata`.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        super().__init__(**kwargs)
        frame = pd.DataFrame(data)
        self.metadata = frame

    def list_docs(self):
        """
        Return a generator with one mapping per row of `self.metadata`.

        Every row tuple produced by `DataFrame.itertuples()` is converted
        with its `_asdict()` method.
        """
        rows = self.metadata.itertuples()
        return (row._asdict() for row in rows)

 

def fn2metadata(glob_pattern='corpus/*.txt', fn_pattern=regex.compile('(?<author>[^_]+)_(?<title>.+)'), index=None):
    """
    Extracts basic metadata from filenames.

    Args:
        glob_pattern (str): A glob pattern matching the files to list, cf. glob.glob
        fn_pattern: A compiled regular expression that extracts metadata fields from the files' base name.
            The pattern must contain named groups (which have the form `(?<name>pattern)`, where name is
            the name of the metadata field and pattern is the part of the re matching this field name).
            The default will expect file names that contain a `_`, and it will assign everything before
            the _ to the `author` field and everything after to the `title` field.
            Files whose base name does not match are still listed, with only
            `basename` and `filename` filled in.
        index (str): Name of the column that will be used as index column. If None, an artificial index
            (integers, starting at 0) will be used.

    Returns:
        pd.DataFrame with the following columns:
            * basename: filename without path or extension
            * filename: full filename
            * one column for every named pattern that matched for at least one file

        Rows are sorted by `title` if such a column exists.

    Raises:
        KeyError: if `index` is given but names no column of the result.
    """
    metadata_list = []
    for filename in glob.glob(glob_pattern):
        basename, __ = os.path.splitext(os.path.basename(filename))
        match = fn_pattern.match(basename)
        # match() returns None for a name that does not fit the pattern;
        # keep the file with just its name columns instead of crashing.
        md = match.groupdict() if match is not None else {}
        md["basename"] = basename
        md["filename"] = filename
        metadata_list.append(md)
    metadata = pd.DataFrame(metadata_list)
    # Without matching files, or with a pattern lacking a `title` group,
    # there is no such column to sort by.
    if 'title' in metadata.columns:
        metadata = metadata.sort_values('title')
    # Renumber the rows in sorted order and discard the old positions.
    metadata = metadata.reset_index(drop=True)
    # Apply the requested index last so that it survives the renumbering.
    if index is not None:
        metadata = metadata.set_index(index)
    return metadata