Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

# -*- coding: utf-8 -*- 

 

""" 

Maintaining Lists of Documents 

============================== 

 

A *document list* manages a list of documents. There are various

implementations of varying capability, but all have the following in common:

 

* A document list keeps a fixed list of documents in order, i.e. after you 

created the list you can call the iteration functions and get the same file 

at the same position (even if, e.g., the underlying directory changes). So 

these files can be matched with lists of document *contents*. 

 

* A document list separates the *base directory* from the way *file

names* are formed. Thus, you can easily create a mirror (of, e.g., files transformed

in some way) in a different directory, or modify the way filenames are formed.

 

 

""" 

# Module metadata: organisation, individual author and contact address.
__author__ = "DARIAH-DE" 

__authors__ = "Thorsten Vitt" 

__email__ = "thorsten.vitt@uni-wuerzburg.de" 

 

 

from pathlib import Path 

from itertools import zip_longest 

from abc import abstractmethod 

from copy import deepcopy 

 

class BaseDocList:
    """
    Base class with common functionality.

    Users should not instantiate this but rather a specialized subclass like
    `PathDocList`.

    Subclasses must provide `get_docs` and `label`. They may additionally
    provide `_getitem(self, index)` (index being an integer or a slice) to
    speed up indexed access.
    """

    def __init__(self, basepath):
        """
        Args:
            basepath (Path or str): base directory of the documents. It is
                stored as a `Path` in ``self.basepath``.
        """
        self.basepath = Path(basepath)
        # Filled by flatten_segments(); None means "no segments recorded".
        self._segment_counts = None

    def copy(self):
        """
        Returns a deep copy of this document list.
        """
        return deepcopy(self)

    def full_path(self, document, as_str=False):
        """
        Constructs a full path for the given document.

        Args:
            document: this is one document in the way the subclass chooses to
                represent documents.
            as_str (bool): if True, the result is a str, otherwise it is a `Path`

        Returns:
            Path or str: ``self.basepath`` joined with `document`.

        Notes:
            The default implementation passes document on to `Path()`.
            Implementers will most probably want to override this.
        """
        path = Path(self.basepath, document)
        return str(path) if as_str else path

    # NOTE: this class has no ABC metaclass, so @abstractmethod only marks
    # the method as "to be overridden"; Python does not enforce it here.
    @abstractmethod
    def get_docs(self):
        """
        Returns a sequence of documents, in the form the implementing class
        chooses.

        Note:
            Subclasses may implement a method `_getitem(self, index)`, with
            index being integer or slice, to speed access up.
        """
        pass

    def full_paths(self, as_str=False):
        """
        Returns a list of full paths. Calls full_path for every document.

        Args:
            as_str (bool): if True, the list contains str, otherwise `Path`.
        """
        return [self.full_path(doc, as_str) for doc in self.get_docs()]

    @abstractmethod
    def label(self, document):
        """
        Returns a label suitable for the document.
        """
        pass

    def __iter__(self):
        """
        When used as an iterable, this object looks like an iterable of full
        paths (as strings).
        """
        return iter(self.full_paths(as_str=True))

    def __len__(self):
        """
        Returns the number of documents returned by `get_docs`.
        """
        return len(self.get_docs())

    def __getitem__(self, index):
        """
        When used as a sequence, this object looks like a sequence of full
        paths (as strings).

        Args:
            index (int or slice): position(s) of the requested document(s).

        Returns:
            str for an integer index, list of str for a slice.
        """
        # Look the optional hook up explicitly instead of catching
        # AttributeError around the call: that would also swallow genuine
        # AttributeErrors raised *inside* a subclass' _getitem.
        getter = getattr(self, '_getitem', None)
        if getter is not None:
            selection = getter(index)
        else:
            selection = self.get_docs()[index]

        if isinstance(index, slice):
            return [self.full_path(doc, as_str=True) for doc in selection]
        return self.full_path(selection, as_str=True)

    def labels(self):
        """
        Returns a list of (human-readable) labels for all documents.
        """
        return [self.label(doc) for doc in self.get_docs()]

    def flatten_segments(self, segmented_docs):
        """
        Records and flattens segment counts according to the stream of documents.

        Assume you have three documents

        | A : I am an example document
        | B : Me too
        | C : All examples reference themselves

        ::
            docs = SimpleDocList('.', filenames=['A','B','C'])

        Now, you have an (external) segmenter function that segments each document
        into segments each being at most two tokens long. The data structure your
        segmenter will produce looks similar to the following::

            segmented_corpus = [
                [['I', 'am'], ['an', 'example'], ['document']],
                [['Me', 'too']],
                [['All', 'examples'], ['reference', 'themselves']]]

        Now, if you run ``docs.flatten_segments(segmented_corpus)``, it will do
        two things: it will record how many segments each document has
        (A: 3, B: 1, C: 2), and it will return a structure flattened by one
        level as in the following::

            [['I', 'am'], ['an', 'example'], ['document'], ['Me', 'too'],
             ['All', 'examples'], ['reference', 'themselves']]

        I.e. the result will look like a corpus of six shorter documents. This
        matches with the iteration you get when you call docs.segments().

        Args:
            segmented_docs: Iterable of documents, each document being an
                iterable of segments.

        Returns:
            Iterable of segments.

        Notes:
            Instead of lists you will receive generators, but you can iterate
            over these as well.

            This method is itself a generator: the segment counts are reset
            when iteration starts and are only complete once the returned
            generator has been fully consumed.
        """
        segment_counts = []
        # Publish the (still growing) list right away; it is updated in place
        # while the caller consumes the generator.
        self._segment_counts = segment_counts
        for doc in segmented_docs:
            segment_counts.append(0)
            for segment in doc:
                segment_counts[-1] += 1
                yield segment

    def segment_counts(self):
        """
        Returns the recorded number of segments for each document, in
        document order, or `None` if no segments have been recorded yet
        (see `flatten_segments`).
        """
        return self._segment_counts

    def segments(self):
        """
        Yields a tuple (document, segment_no) for each segment, with document
        being the internal representation of each document and segment_no an
        integer starting at 0.

        Documents for which no segment count has been recorded (in particular
        all documents if `flatten_segments` has not been run) are yielded
        once as ``(document, None)``.
        """
        # Without recorded counts there is nothing to zip against; fall back
        # to an empty sequence so every document is reported unsegmented
        # instead of failing with a TypeError on a non-iterable.
        counts = self.segment_counts() or ()
        for document, segment_count in zip_longest(self.get_docs(), counts):
            if segment_count is None:
                yield (document, None)
            else:
                for segment_no in range(segment_count):
                    yield (document, segment_no)

    def segment_filenames(self,
                          format="{path.stem}.{segment:0{maxwidth}d}{path.suffix}",
                          basepath=None,
                          as_str=False):
        """
        Generates a file name for each recorded segment.

        Args:
            format (str): A :obj:`string.Formatter` pattern that describes how
                to form each filename. The following formatter variables are
                available:

                * path (:obj:`~pathlib.Path`): original file path
                * segment (`int`): current segment number
                * maxwidth (`int`): number of digits of the largest
                  segment count overall
            basepath: Base path for the file names. By default, self.basepath will be used.
            as_str (bool): Convert the result to strings.

        Yields:
            pathlib.Path: path for each segment (str if `as_str` is True)

        Raises:
            ValueError: if no segments have been recorded. As this is a
                generator, the error is raised when iteration starts.
        """
        segment_counts = self.segment_counts()
        # Covers None as well as an empty record; both mean there is nothing
        # to derive a width or a file name from.
        if not segment_counts:
            raise ValueError("No segments recorded.")
        maxwidth = len(str(max(segment_counts)))
        if basepath is None:
            basepath = self.basepath

        for document, segment_no in self.segments():
            filename = format.format(path=document, maxwidth=maxwidth,
                                     segment=segment_no)
            segment_path = Path(basepath, filename)
            yield str(segment_path) if as_str else segment_path

 

 

 

 

 

class PathDocList(BaseDocList):
    """
    Document list based on a list of Paths.

    Each document is represented by a `Path` relative to the base path.
    """

    def __init__(self, basepath, glob_pattern='*', filenames=None):
        """
        Creates a new document list either from the given file names
        or by looking for files matching the glob_pattern in the basepath.

        Args:
            basepath (Path or str): Root directory where your corpus resides
            glob_pattern (str): A file glob pattern matching the files to
                include. When `filenames` is given, it is used as a filter
                on those names; pass `None` to keep all given names.
            filenames (list): An iterable of paths or file names relative to
                basepath. If `None`, look for files on the file system.
        """
        # Let the base class set up basepath and the segment bookkeeping.
        super().__init__(basepath)
        if filenames is None:
            self._files = [p.relative_to(self.basepath)
                           for p in self.basepath.glob(glob_pattern)]
        else:
            paths = (Path(name) for name in filenames)
            if glob_pattern is not None:
                paths = (path for path in paths if path.match(glob_pattern))
            self._files = list(paths)

    def get_docs(self):
        """
        Returns the list of (relative) paths managed by this document list.
        """
        return self._files

    def label(self, document):
        """
        Returns the file name of the document without its suffix.
        """
        return document.stem

    def with_segment_files(self, basepath=None, **kwargs):
        """
        Returns a copy of this list which has the recorded segment numbers
        incorporated into the file names. I.e., this version does not know
        anymore about segments but rather has a file name for each segment.

        Args:
            basepath: Base path of the returned list. By default,
                self.basepath will be used.
            **kwargs: passed on to `segment_filenames`. The relevant option
                is ``format`` (str), a `string.Formatter` pattern that
                describes how to form each filename. The following formatter
                variables are available:

                path (Path): original file path
                segment (int): current segment number
                maxwidth (int): number of digits of the largest
                    segment count overall

        Returns:
            PathDocList: a new list with one entry per segment and no
            recorded segment counts.

        Raises:
            ValueError: if no segments
        """
        if self.segment_counts() is None:
            raise ValueError("No segments recorded.")
        if basepath is None:
            basepath = self.basepath
        result = self.copy()
        # The copy has one file per segment, so it has no segments of its
        # own. None is the "nothing recorded" marker used by __init__ (the
        # previous value 0 made segments()/segment_filenames() fail).
        result._segment_counts = None
        result.basepath = Path(basepath)
        # basepath='' keeps the generated names relative, like all other
        # entries of _files.
        result._files = list(self.segment_filenames(basepath='', **kwargs))
        return result