Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Initialisation statements (the enclosing `def` line is not part of this
# extract): keep `basepath` as a pathlib.Path, remember `default_pattern`
# unchanged, and reset the cached per-document segment counts to None.
# Other fragments in this file test `_segment_counts` against None to detect
# that nothing has been segmented yet.
self.basepath = Path(basepath) self.default_pattern = default_pattern self._segment_counts = None
def get_docs(self):
    """Return the documents managed by this object.

    This implementation is an empty placeholder: it performs no work and
    returns None.
    """
    # eq. get_docs, list_docs
    # NOTE(review): presumably subclasses override this with a real
    # implementation -- confirm before relying on the None result.
    pass
# Returns an independent deep copy of this object, built with deepcopy, so
# the copy can be modified without affecting the original.
# NOTE(review): the enclosing `def` is not visible in this extract;
# presumably this is the `copy()` method invoked elsewhere as `self.copy()`
# -- confirm.
return deepcopy(self)
# Joins `document` onto `self.basepath` and returns the result: a
# pathlib.Path, or its str() form when `as_str` is truthy.
# NOTE(review): the enclosing `def` is not visible in this extract, so the
# default value of `as_str` cannot be stated here.
path = Path(self.basepath, document) if as_str: path = str(path) return path
# Returns a list with the full path of every document from get_docs(), each
# produced by get_full_path(doc, as_str) and therefore in the same order.
return [self.get_full_path(doc, as_str) for doc in self.get_docs()]
# Generator body: flattens `segmented_docs` (an iterable of documents, each
# itself an iterable of segments) into one stream of segments while counting
# the segments of every document.
# A fresh list is installed as `self._segment_counts` before iteration starts
# and is filled in as segments are consumed, so the counts are only complete
# once the generator has been exhausted.
# NOTE(review): the enclosing `def` is not visible in this extract.
segment_counts = [] self._segment_counts = segment_counts for doc in segmented_docs: segment_counts.append(0) for segment in doc: segment_counts[-1] += 1 yield segment
# Generator body: raises ValueError("Not segmentized yet") when no segment
# counts have been recorded; otherwise it is meant to yield one dict per
# (document, segment number) combination.
# NOTE(review): the `for` statement iterates over the 2-tuple
# (self.get_docs(), self.segment_counts()) instead of pairing their elements;
# `zip(self.get_docs(), self.segment_counts())` looks intended -- confirm.
# NOTE(review): `dict(doc, segment_no)` passes two positional arguments, which
# dict() rejects with TypeError; a keyword argument was probably meant, but
# the intended key name is not visible here -- confirm.
# NOTE(review): the enclosing `def` is not visible; presumably this is the
# `yield_segments` method called by the filename generator -- confirm.
if self.segment_counts() is None: raise ValueError("Not segmentized yet") for doc, segment_count in self.get_docs(), self.segment_counts(): for segment_no in range(segment_count): yield dict(doc, segment_no)
# Returns the recorded per-document segment counts exactly as stored in
# `self._segment_counts`; this is None as long as nothing has been recorded.
# The stored object itself is returned, not a copy.
return self._segment_counts
# Generator body: yields one file name per document, or per segment when
# `segments` is truthy (items then come from yield_segments() instead of
# get_docs()).
# `basepath` and `pattern` fall back to `self.basepath` and
# `self.default_pattern` when they are None. Each name is
# `pattern.format_map(item)` joined onto `basepath` with os.path.join, so
# every item has to be a mapping that provides the fields used by the pattern.
# NOTE(review): in this extract the statements share one physical line with
# the leading `#filenames` comment, so Python reads the whole line as a
# comment; the enclosing `def` is not visible either.
#filenames, yields filenames if basepath is None: basepath = self.basepath if pattern is None: pattern = self.default_pattern items = self.yield_segments() if segments else self.get_docs() for doc in items: filename = os.path.join(basepath, pattern.format_map(doc)) yield filename
# Generator body: walks the generated file names in lockstep with the extra
# positional iterables and yields `function(filename, item, ..., **kwargs)`
# for every step; zip stops at the shortest input.
# The loop variable `args` rebinds the name that is also unpacked as `*args`
# in the zip call. That call is evaluated once before the loop starts, so
# the rebinding does not affect it.
# NOTE(review): `basedir` is passed here, whereas the filename generator's
# own body names that value `basepath`; the enclosing `def` is not visible in
# this extract -- confirm the parameter name.
for args in zip(self.yield_filenames(basedir, pattern, segments), *args): yield function(*args, **kwargs)
def segment_filenames(self,
                      format="{path.stem}.{segment:0{maxwidth}d}{path.suffix}",
                      basepath=None, as_str=False):
    """Yield one path per recorded segment of every document.

    Documents are taken from ``self.get_docs()`` and paired, in order, with
    the counts from ``self.segment_counts()``. Segment numbers start at 0.

    Args:
        format: ``str.format`` template receiving ``path`` (the document),
            ``segment`` (the segment number) and ``maxwidth`` (number of
            digits of the largest segment count, used for zero padding).
            With the default template ``path`` must offer ``stem`` and
            ``suffix`` attributes, as ``pathlib.Path`` does.
        basepath: Directory the generated names are joined onto; ``None``
            selects ``self.basepath``.
        as_str: Yield ``str`` objects instead of ``pathlib.Path`` objects.

    Yields:
        The path of each segment, document by document.

    Raises:
        ValueError: If no segment counts have been recorded.
    """
    # NOTE(review): the leading `def segment_filenames(self,` is not visible
    # in the extract this block was taken from; it is restored from the
    # inline comment and the `self.segment_filenames(...)` call site.
    # Confirm against the original file.
    segment_counts = self.segment_counts()
    if segment_counts is None:
        raise ValueError("No segments recorded.")
    # default=0 lets a collection without documents yield nothing instead of
    # failing inside max().
    maxwidth = len(str(max(segment_counts, default=0)))
    if basepath is None:
        basepath = self.basepath

    # The counts are plain integers, one per document, so they have to be
    # paired with the documents and expanded into individual segment numbers.
    for document, segment_count in zip(self.get_docs(), segment_counts):
        for segment_no in range(segment_count):
            filename = format.format(path=document, maxwidth=maxwidth,
                                     segment=segment_no)
            segment_path = Path(basepath, filename)
            yield str(segment_path) if as_str else segment_path
# Returns an iterator over the full paths of all documents, each as a string
# (get_full_paths is called with as_str=True).
# NOTE(review): the enclosing `def` is not visible in this extract;
# presumably this is `__iter__` -- confirm.
return iter(self.get_full_paths(as_str=True))
# Returns the number of documents, i.e. len() of whatever get_docs() returns;
# get_docs() therefore has to return a sized object, not a generator.
# NOTE(review): the enclosing `def` is not visible in this extract;
# presumably this is `__len__` -- confirm.
return len(self.get_docs())
# Looks up `index` and returns the matching full path(s) as strings: a list
# when `index` is a slice, a single string otherwise.
# The lookup first tries `self.__getitem__(index)` and falls back to indexing
# the result of get_docs() when that raises AttributeError.
# NOTE(review): the enclosing `def` is not visible in this extract. If these
# statements are themselves the body of `__getitem__`, the first call
# recurses without bound and ends in RecursionError, which the
# `except AttributeError` clause does not catch -- confirm which method this
# is and what the first lookup was meant to call.
try: selection = self.__getitem__(index) except AttributeError: selection = self.get_docs()[index]
if isinstance(index, slice): return [self.get_full_path(doc, as_str=True) for doc in selection] else: return self.get_full_path(selection, as_str=True)
# Initialisation statements of a file-backed variant (the enclosing `def` and
# `class` lines are not part of this extract): forwards the remaining keyword
# arguments to the parent initialiser, stores `basepath` as a pathlib.Path
# and resets the segment counts to None.
# Without explicit `filenames`, the documents are found by globbing
# `glob_pattern` below the base path and stored relative to it. Otherwise
# the given names are wrapped in Path objects and, when `glob_pattern` is not
# None, only those matching it (Path.match) are kept.
# NOTE(review): the globbing branch passes `glob_pattern` straight to
# Path.glob, so it presumably must not be None there -- confirm the
# parameter's default.
super().__init__(**kwargs) self.basepath = Path(basepath) self._segment_counts = None if filenames is None: self._files = [p.relative_to(self.basepath) for p in self.basepath.glob(glob_pattern)] else: paths = (Path(name) for name in filenames) if glob_pattern is not None: paths = (path for path in paths if path.match(glob_pattern)) self._files = list(paths)
# Returns the stored list of document paths (`self._files`). The list object
# itself is handed out, not a copy, so changes made by the caller are visible
# to this object.
# NOTE(review): the enclosing `def` is not visible in this extract;
# presumably this is the file-backed `get_docs` -- confirm.
return self._files
# Builds a new collection whose documents are the segment files of this one:
# raises ValueError("No segments recorded.") when no counts exist, then
# copies this object, points the copy at `basepath` (falling back to
# `self.basepath` when None) and replaces its file list with the names from
# segment_filenames(basepath='', **kwargs). The empty base path keeps the
# generated names relative.
# NOTE(review): the copy's `_segment_counts` is set to 0 rather than None.
# Code that tests `segment_counts() is None` will not treat the copy as
# "not segmented"; None may have been intended -- confirm.
# NOTE(review): the enclosing `def` is not visible in this extract.
segment_counts = self.segment_counts() if segment_counts is None: raise ValueError("No segments recorded.") if basepath is None: basepath = self.basepath result = self.copy() result._segment_counts = 0 result.basepath = basepath result._files = list(self.segment_filenames(basepath='', **kwargs)) return result
# Initialisation statements of a metadata-backed variant (the enclosing `def`
# and `class` lines are not part of this extract): forwards the remaining
# keyword arguments to the parent initialiser and wraps `data` in a pandas
# DataFrame stored as `self.metadata`.
super().__init__(**kwargs) self.metadata = pd.DataFrame(data)
# Returns a generator producing one mapping per row of `self.metadata`, built
# from the named tuples of DataFrame.itertuples() via `_asdict()`.
# With itertuples' default arguments the row index is included as the first
# field, named "Index".
# NOTE(review): the result is a one-shot generator without a length, so code
# that calls len() on it or indexes it will fail -- confirm callers.
# NOTE(review): the enclosing `def` is not visible in this extract.
return (t._asdict() for t in self.metadata.itertuples())
# Builds a metadata DataFrame from file names: for every path matched by
# glob.glob(glob_pattern), the base name without extension is matched against
# the compiled regular expression `fn_pattern`. The named groups of that
# match become columns, together with "basename" and "filename" (the path as
# returned by glob). When `index` is not None the frame is re-indexed with
# set_index(index).
# NOTE(review): `fn_pattern.match(...)` returns None for a name that does not
# match, so `.groupdict()` would raise AttributeError -- confirm whether
# non-matching files can occur.
# NOTE(review): the trailing `|` looks like residue of the text extraction,
# and the enclosing `def` is not visible in this extract.
metadata_list = [] for filename in glob.glob(glob_pattern): basename, __ = os.path.splitext(os.path.basename(filename)) md = fn_pattern.match(basename).groupdict() md["basename"] = basename md["filename"] = filename metadata_list.append(md) metadata = pd.DataFrame(metadata_list) if index is not None: metadata = metadata.set_index(index) return metadata |