Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
#!/usr/bin/env python3 # -*- coding: utf-8 -*-
This module contains functions for various preprocessing steps provided by `DARIAH-DE`_.
.. _DARIAH-DE: https://de.dariah.eu https://github.com/DARIAH-DE """
format='%(levelname)s %(name)s: %(message)s')
"""
"""
self.basepath = Path(basepath)
""" Constructs a full path for the given document.
Args: document: this is one document in the way the subclass chooses to represent documents. as_str (bool): if True, the result is a str, otherwise it is a `Path`
Notes: The default implementation passes the document on to `Path()`. Implementers will most probably want to override this. """
def get_docs(self): """ Returns a sequence of documents, in the form the implementing class chooses.
Note: Subclasses may implement a method `_get_item(self, index)`, with index being integer or slice, to speed access up. """ pass
""" Returns a list of full paths. Calls full_path. """
def label(self, document): """ Returns a label suitable for the document. """
""" When used as an iterable, this object looks like an iterable of full paths. """
""" When used as a sequence, this object looks like a sequence of full paths. """ return len(self.get_docs())
""" When used as a sequence, this object looks like a sequence of full paths. """ try: selection = self._getitem(index) selection = self.get_docs()[index]
else:
""" Returns a list of (human-readable) labels for all documents. """
""" Records and flattens segment counts according to the stream of documents.
Assume you have three documents
A : I am an example document B : Me too C : All examples reference themselves
docs = SimpleDocList('.', filenames=['A','B','C'])
Now, you have an (external) segmenter function that segments each document into segments each being at most two tokens long. The data structure your segmenter will produce looks similar to the following:
segmented_corpus = \ [[['I', 'am'], ['an', 'example'], ['document']], [['Me', 'too']], [['All', 'examples'], ['reference', 'themselves']]]
Now, if you run docs.flatten_segments(segmented_corpus), it will do two things: it will record how many segments each document has (A: 3, B: 1, C: 2), and it will return a structure flattened by one level as in the following:
[['I', 'am'], ['an', 'example'], ['document'], ['Me', 'too'], ['All', 'examples'], ['reference', 'themselves']]
I.e. the result will look like a corpus of six shorter documents. This matches with the iteration you get when you call docs.segments().
Args: segmented_docs: Iterable of documents, each document being an iterable of segments.
Returns: Iterable of segments.
Notes: Instead of lists you will receive generators, but you can iterate over these as well. """
""" Returns an iterable of the number of segments for each document. """ return self._segment_counts
""" Yields a tuple (document, segment_no) for each segment, with document being the internal representation of each document and segment_no an integer starting at 0 """ for document, segment_count in zip_longest(self.get_docs(), self.segment_counts()): if segment_count is None: else:
format="{path.stem}.{segment:0{maxwidth}d}{path.suffix}", basepath=None, as_str=False): """
Args: format (str): A `string.Formatter` pattern that describes how to form each filename. The following formatter variables are available:
path (Path): original file path segment (int): current segment number maxwidth (int): number of digits required for the largest segment number overall Raises: ValueError: if no segments """ segment_counts = self.segment_counts()
segment=segment_no) yield str(segment_path) else:
""" Document list based on a list of Paths. """
""" Creates a new document list either from the given file names or by looking for files matching the glob_pattern in the basepath.
Args: basepath (Path or str): Root directory where your corpus resides glob_pattern (str): A file glob pattern matching the files to include. filenames (list): An iterable of paths or file names relative to basepath. If `None`, look for files on the file system. """ for p in self.basepath.glob(glob_pattern)] else:
""" Returns a copy of this list which has the recorded segment numbers incorporated into the file names. I.e., this version does not know anymore about segments but rather has a file name for each segment.
Args: pattern (str): A `string.Formatter` pattern that describes how to form each filename. The following formatter variables are available:
path (Path): original file path segment (int): current segment number maxwidth (int): number of digits required for the largest segment number overall Raises: ValueError: if no segments """ result = self.copy()
"""Opens TXT files using file paths.
Description: With this function you can read plain text files. Pass a list of full paths or one single path as argument. Use the function `create_document_list()` to create a list of your text files.
Args: doclist Union(list[str], str): List of all documents in the corpus or single path to TXT file.
Yields: Document.
Todo: * Separate metadata (author, header)
Example: >>> list(read_from_txt('corpus_txt/Doyle_AScandalinBohemia.txt'))[0][:20] 'A SCANDAL IN BOHEMIA' """
"""Opens TEI XML files using file paths.
Description: With this function you can read TEI encoded XML files. Pass a list of full paths or one single path as argument. Use the function `create_document_list()` to create a list of your XML files.
Args: doclist Union(list[str], str): List of all documents in the corpus or single path to TEI XML file.
Yields: Document.
Todo: * Separate metadata (author, header)?
Example: >>> list(read_from_tei('corpus_tei/Schnitzler_Amerika.xml'))[0][142:159] 'Arthur Schnitzler' """
"""Opens CSV files using file paths.
Description: With this function you can read CSV files generated by `DARIAH-DKPro-Wrapper`_, a tool for natural language processing. Pass a list of full paths or one single path as argument. You also have the ability to select certain columns. Use the function `create_document_list()` to create a list of your CSV files. .. _DARIAH-DKPro-Wrapper: https://github.com/DARIAH-DE/DARIAH-DKPro-Wrapper
Args: doclist Union(list[str], str): List of all documents in the corpus or single path to CSV file. columns (list[str]): List of CSV column names. Defaults to '['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']'.
Yields: Document.
Todo: * Separate metadata (author, header)?
Example: >>> list(read_from_csv('corpus_csv/Doyle_AScandalinBohemia.txt.csv'))[0][:4] # doctest: +NORMALIZE_WHITESPACE ParagraphId TokenId Lemma CPOS NamedEntity 0 0 0 a ART _ 1 0 1 scandal NP _ 2 0 2 in PP _ 3 0 3 bohemia NP _ """
"""Tokenizes with Unicode Regular Expressions.
Description: With this function you can tokenize a document with a regular expression. You also have the ability to pass your own regular expression. The default expression is '\p{Letter}+\p{Punctuation}?\p{Letter}+', which means one or more letters, followed by one or no punctuation, followed by one or more letters. So one-letter words won't match. In case you want to lowercase all tokens, set the argument `lower` to True (it is by default). If you want a very simple and primitive tokenization, set the argument `simple` to True. Use the functions `read_from_txt()`, `read_from_tei()` or `read_from_csv()` to read your text files.
Args: doc_txt (str): Document as string. expression (str): Regular expression to find tokens. lower (boolean): If True, lowers all words. Defaults to True. simple (boolean): Uses simple regular expression (r'\w+'). Defaults to False. If set to True, argument `expression` will be ignored.
Yields: Tokens
Example: >>> list(tokenize("This is one example text.")) ['this', 'is', 'one', 'example', 'text'] """ pattern = regex.compile(r'\w+') else:
"""Gets lemmas by selected POS-tags from DARIAH-DKPro-Wrapper output.
Description: With this function you can select certain columns of a CSV file generated by `DARIAH-DKPro-Wrapper`_, a tool for natural language processing. Use the function `read_from_csv()` to read CSV files. .. _DARIAH-DKPro-Wrapper: https://github.com/DARIAH-DE/DARIAH-DKPro-Wrapper
Args: doc_csv (DataFrame): DataFrame containing DARIAH-DKPro-Wrapper output. pos_tags (list[str]): List of DKPro POS-tags that should be selected. Defaults to '['ADJ', 'V', 'NN']'.
Yields: Lemma.
Example: >>> df = pd.DataFrame({'CPOS': ['CARD', 'ADJ', 'NN', 'NN'], ... 'Lemma': ['one', 'more', 'example', 'text']}) >>> list(filter_pos_tags(df))[0] # doctest: +NORMALIZE_WHITESPACE 1 more 2 example 3 text Name: Lemma, dtype: object """
"""Splits the given document by paragraphs.
Description: With this function you can split a document by paragraphs. You also have the ability to select a certain regular expression to split the document. Use the functions `read_from_txt()`, `read_from_tei()` or `read_from_csv()` to read your text files.
Args: doc_txt (str): Document text. sep (regex.Regex): Separator indicating a paragraph.
Returns: List of paragraphs.
Example: >>> split_paragraphs("This test contains \\n paragraphs.") ['This test contains ', ' paragraphs.'] """
"""Segments a document, tolerating existing chunks (like paragraphs).
Description: Consider you have a document. You wish to split the document into segments of about 1000 tokens, but you prefer to keep paragraphs together if this does not increase or decrease the token size by more than 5%.
Args: document: The document to process. This is an Iterable of chunks, each of which is an iterable of tokens. segment_size (int): The target length of each segment in tokens. tolerance (Number): How much may the actual segment size differ from the segment_size? If 0 < tolerance < 1, this is interpreted as a fraction of the segment_size, otherwise it is interpreted as an absolute number. If tolerance < 0, chunks are never split apart.
Yields: Segments. Each segment is a list of chunks, each chunk is a list of tokens.
Example: >>> list(segment_fuzzy([['This', 'test', 'is', 'very', 'clear'], ... ['and', 'contains', 'chunks']], 2)) # doctest: +NORMALIZE_WHITESPACE [[['This', 'test']], [['is', 'very']], [['clear'], ['and']], [['contains', 'chunks']]] """
# handle leftovers
tokenizer=None, flatten_chunks=False, materialize=False): """Segments a document into segments of about `segment_size` tokens, respecting existing chunks.
Description: Consider you have a document. You wish to split the document into segments of about 1000 tokens, but you prefer to keep paragraphs together if this does not increase or decrease the token size by more than 5%. This is a convenience wrapper around `segment_fuzzy()`.
Args: segment_size (int): The target size of each segment, in tokens. tolerance (Number): see `segment_fuzzy` chunker (callable): a one-argument function that cuts the document into chunks. If this is present, it is called on the given document. tokenizer (callable): a one-argument function that tokenizes each chunk. flatten_chunks (bool): if True, undo the effect of the chunker by chaining the chunks in each segment, thus each segment consists of tokens. This can also be a one-argument function in order to customize the un-chunking.
Example: >>> list(segment([['This', 'test', 'is', 'very', 'clear'], ... ['and', 'contains', 'chunks']], 2)) # doctest: +NORMALIZE_WHITESPACE [[['This', 'test']], [['is', 'very']], [['clear'], ['and']], [['contains', 'chunks']]] """
"""Removes features using feature list.
Description: With this function you can remove features from preprocessed files. Pass a list of features. Use the function `tokenize()` to access your files.
Args: doc_token_list Union(list[str], str): List of all documents in the corpus and their tokens. features_to_be_removed list[str]: List of features that should be removed Yields: cleaned token array
Todo:
Example: >>> doc_tokens = [['short', 'example', 'example', 'text', 'text']] >>> features_to_be_removed = ['example'] >>> test = remove_features_from_file(doc_tokens, features_to_be_removed) >>> list(test) [['short', 'text', 'text']] """ #log.info("Removing features ...") #get indices of features that should be deleted
"""Creates files for mallet import.
Description: With this function you can create preprocessed plain text files. Pass a list of full paths or one single path as argument. Use the function `remove_features_from_file()` to create a list of tokens per document.
Args: doc_tokens_cleaned Union(list[str], str): List of tokens per document doc_labels list[str]: List of documents labels.
Todo:
Example: >>> doc_labels = ['examplefile'] >>> doc_tokens_cleaned = [['short', 'example', 'text']] >>> create_mallet_import(doc_tokens_cleaned, doc_labels) >>> outpath = os.path.join('tutorial_supplementals', 'mallet_input') >>> os.path.isfile(os.path.join(outpath, 'examplefile.txt')) True """ #log.info("Generating mallet input files ...") os.makedirs(outpath)
"""Creates a dictionary of unique tokens with identifier.
Description: With this function you can create a dictionary of unique tokens as key and an identifier as value. Use the function `tokenize()` to tokenize your text files.
Args: tokens (list): List of tokens.
Returns: Dictionary.
Example: >>> create_dictionary(['example']) {'example': 1} """
"""Creates a dictionary of dictionaries.
Description: Only the function `create_sparse_bow()` uses this private function to create a dictionary of dictionaries. The first level consists of the document label as key, and the dictionary of counts as value. The second level consists of token ID as key, and the count of tokens in document pairs as value.
Args: doc_labels (list): List of doc labels. doc_tokens (list): List of tokens. type_dictionary (dict): Dictionary of {token: id}.
Returns: Dictionary of dictionaries.
Example: >>> doc_labels = ['exampletext'] >>> doc_tokens = [['short', 'example', 'example', 'text', 'text']] >>> type_dictionary = {'short': 1, 'example': 2, 'text': 3} >>> isinstance(_create_large_counter(doc_labels, doc_tokens, type_dictionary), defaultdict) True """ [type_dictionary[token] for token in tokens])
"""Creates a sparse index for pandas DataFrame.
Description: Only the function `create_sparse_bow()` uses this private function to create a pandas multiindex out of tuples. The multiindex represents document ID to token IDs relations.
Args: largecounter (dict): Dictionary of {document: {token: frequency}}.
Returns: Pandas MultiIndex.
Example: >>> doc_labels = ['exampletext'] >>> doc_tokens = [['short', 'example', 'example', 'text', 'text']] >>> type_dictionary = {'short': 1, 'example': 2, 'text': 3} >>> largecounter = _create_large_counter(doc_labels, doc_tokens, type_dictionary) >>> isinstance(_create_sparse_index(largecounter), pd.MultiIndex) True """ tuples, names=['doc_id', 'token_id'])
"""Creates sparse matrix for bag-of-words model.
Description: This function creates a sparse DataFrame ('bow' means `bag-of-words`_) containing document and type identifier as multiindex and type frequencies as values representing the counts of tokens for each token in each document. It is also the main function that incorporates the private functions `_create_large_counter()` and `_create_sparse_index()`. Use the function `get_labels()` for `doc_labels`, `tokenize()` for `doc_tokens`, and `create_dictionary()` for `type_dictionary` as well as for `doc_ids`. Use the function `create_dictionary()` to generate the dictionaries `type_dictionary` and `doc_dictionary`. .. _bag-of-words: https://en.wikipedia.org/wiki/Bag-of-words_model
Args: doc_labels (list[str]): List of doc labels as string. doc_tokens (list[str]): List of tokens as string. type_dictionary (dict[str]): Dictionary with {token: id}. doc_ids (dict[str]): Dictionary with {document label: id}.
Returns: Multiindexed Pandas DataFrame.
ToDo: * Test if it's necessary to build sparse_df_filled with int8 zeroes instead of int64. * Avoid saving sparse bow as .mm file to ingest into gensim.
Example: >>> doc_labels = ['exampletext'] >>> doc_tokens = [['short', 'example', 'text']] >>> type_dictionary = {'short': 1, 'example': 2, 'text': 3} >>> doc_ids = {'exampletext': 1} >>> len(create_sparse_bow(doc_labels, doc_tokens, type_dictionary, doc_ids)) 3 """ doc_labels, doc_tokens, type_dictionary) np.zeros((len(sparse_index), 1), dtype=int), index=sparse_index) sparse_index.get_level_values('doc_id'))
(doc_id, token_id), 0, int(largecounter[doc_id][token_id]))
"""Saves sparse matrix for bag-of-words model.
Description: With this function you can save the sparse matrix as `.mm file`_. .. _.mm file: http://math.nist.gov/MatrixMarket/formats.html#MMformat
Args: sparse_bow (DataFrame): DataFrame with term and term frequency by document. output (str): Path to output file without extension, e.g. /tmp/sparsebow.
Returns: None.
Example: >>> doc_labels = ['exampletext'] >>> doc_tokens = [['short', 'example', 'text']] >>> type_dictionary = {'short': 1, 'example': 2, 'text': 3} >>> doc_ids = {'exampletext': 1} >>> sparse_bow = create_sparse_bow(doc_labels, doc_tokens, type_dictionary, doc_ids) >>> save_sparse_bow(sparse_bow, 'sparsebow') >>> import os.path >>> os.path.isfile('sparsebow.mm') True """
" " + str(sum_counts) + "\n"
"""Creates a stopword list.
Description: With this function you can determine the most frequent words, also known as stopwords. First, you have to translate your corpus into the bag-of-words model using the function `create_sparse_bow()` and create a dictionary containing types and identifier using `create_dictionary()`.
Args: sparse_bow (DataFrame): DataFrame with term and term frequency by document. id_types (dict[str]): Dictionary with {token: id}. mfw (int): Target size of most frequent words to be considered.
Returns: Most frequent words in a list.
Example: >>> doc_labels = ['exampletext'] >>> doc_tokens = [['short', 'short', 'example', 'text']] >>> id_types = {'short': 1, 'example': 2, 'text': 3} >>> doc_ids = {'exampletext': 1} >>> sparse_bow = create_sparse_bow(doc_labels, doc_tokens, id_types, doc_ids) >>> find_stopwords(sparse_bow, id_types, 1) ['short'] """ sparse_bow.index.get_level_values('token_id')).sum() for key in sparse_bow_stopwords.index.get_level_values('token_id')]
"""Creates a list with hapax legomena.
Description: With this function you can determine hapax legomena for each document. First, you have to translate your corpus into the bag-of-words model using the function `create_sparse_bow()` and create a dictionary containing types and identifier using `create_dictionary()`.
Args: sparse_bow (DataFrame): DataFrame with term and term frequency by document. id_types (dict[str]): Dictionary with {token: id}.
Returns: Hapax legomena in a list.
Example: >>> doc_labels = ['exampletext'] >>> doc_tokens = [['short', 'example', 'example', 'text', 'text']] >>> id_types = {'short': 1, 'example': 2, 'text': 3} >>> doc_ids = {'exampletext': 1} >>> sparse_bow = create_sparse_bow(doc_labels, doc_tokens, id_types, doc_ids) >>> find_hapax(sparse_bow, id_types) ['short'] """ sparse_bow.index.get_level_values('token_id')).sum() for key in sparse_bow_hapax.index.get_level_values('token_id')]
"""Removes features based on a list of words (types).
Description: With this function you can clean your corpus from stopwords and hapax legomena. First, you have to translate your corpus into the bag-of-words model using the function `create_sparse_bow()` and create a dictionary containing types and identifier using `create_dictionary()`. Use the functions `find_stopwords()` and `find_hapax()` to generate a feature list.
Args: sparse_bow (DataFrame): DataFrame with term and term frequency by document. features Union(set, list): Set or list containing features to remove. (not included) features (str): Text as iterable.
Returns: Clean corpus.
ToDo: * Adapt function to work with mm-corpus format.
Example: >>> doc_labels = ['exampletext'] >>> doc_tokens = [['short', 'example', 'example', 'text', 'text']] >>> id_types = {'short': 1, 'example': 2, 'text': 3} >>> doc_ids = {'exampletext': 1} >>> sparse_bow = create_sparse_bow(doc_labels, doc_tokens, id_types, doc_ids) >>> features = ['short'] >>> len(remove_features(sparse_bow, id_types, features)) 2 """
"""Creates doc2bow_list for gensim.
Description: With this function you can create a doc2bow_list as input for the gensim function `get_document_topics()` to show topics for each document.
Args: sparse_bow (DataFrame): DataFrame with term and term frequency by document.
Returns: List of lists containing tuples.
Example: >>> doc_labels = ['exampletext1', 'exampletext2'] >>> doc_tokens = [['test', 'corpus'], ['for', 'testing']] >>> type_dictionary = {'test': 1, 'corpus': 2, 'for': 3, 'testing': 4} >>> doc_dictionary = {'exampletext1': 1, 'exampletext2': 2} >>> sparse_bow = create_sparse_bow(doc_labels, doc_tokens, type_dictionary, doc_dictionary) >>> from gensim.models import LdaModel >>> from gensim.corpora import Dictionary >>> corpus = [['test', 'corpus'], ['for', 'testing']] >>> dictionary = Dictionary(corpus) >>> documents = [dictionary.doc2bow(document) for document in corpus] >>> model = LdaModel(corpus=documents, id2word=dictionary, iterations=1, passes=1, num_topics=1) >>> make_doc2bow_list(sparse_bow) [[(1, 1), (2, 1)], [(3, 1), (4, 1)]] """ sparse_bow.loc[doc].index, sparse_bow.loc[doc][0])]
"""Converts gensim output to DataFrame.
Description: With this function you can convert gensim output (usually a list of tuples) to a DataFrame, a more convenient data structure.
Args: model: Gensim LDA model. num_keys (int): Number of top keywords for topic.
Returns: DataFrame.
ToDo:
Example: >>> from gensim.models import LdaModel >>> from gensim.corpora import Dictionary >>> corpus = [['test', 'corpus'], ['for', 'testing']] >>> dictionary = Dictionary(corpus) >>> documents = [dictionary.doc2bow(document) for document in corpus] >>> model = LdaModel(corpus=documents, id2word=dictionary, iterations=1, passes=1, num_topics=1) >>> isinstance(gensim2dataframe(model, 4), pd.DataFrame) True """ # num_topics = model.num_topics # topics_df = pd.DataFrame(index=range(num_topics), columns=range(num_keys)) # topics = model.show_topics( # num_topics=num_topics, log=False, formatted=False) # for topic in topics: # idx = topic[0] # temp = topic[1] # topics_df.loc[idx] = temp
columns= range(num_keys))
"""Use only for testing purposes, not working properly
Note:
Args:
Returns:
ToDo: make it work """ df = pd.DataFrame() for idx, doc in enumerate(doc2bow_list, 1): df[doc2id[idx]] = pd.Series( [value[1] for value in model.get_document_topics(doc)]) return df.fillna(0) |