Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
#!/usr/bin/env python # -*- coding: utf-8 -*-
This module contains functions to calculate topic coherence provided by `DARIAH-DE`_. .. _DARIAH-DE: https://de.dariah.eu https://github.com/DARIAH-DE """
type_dictionary[token] = len(type_dictionary) + 1 return type_dictionary[token]
self.topics = topics
bigrams = [] for topic in self.topics.iterrows(): if permutation: bigrams.append(list(permutations(topic, 2))) else: bigrams.append(list(combinations(topic, 2))) return pd.Series(bigrams)
if isinstance(bigrams, set): pass else: keys = set() for topic in bigrams: keys.add(bigram[0]) for key in keys: total = set() for doc in bow.groupby(level=0): if key in doc[1].values: occurences[str(key)] = total return occurences
self.type_dictionary = type_dictionary self.sparse_bow = sparse_bow
n = len(self.sparse_bow.index.levels[0]) k1 = occurences[str(pair[0])] k2 = occurences[str(pair[1])] denominator = ((len(k1) + e) / n) * ((len(k2) + e) / n) if normalize: return np.log(numerator / denominator) / -np.log(numerator) else:
n = len(self.sparse_bow.count(level=0)) k1 = occurences[str(pair[0])] k1k2 = k1.intersection(k2) numerator = len(k1k2) + e / n denominator = len(k2) + e / n return np.log(numerator / denominator)
self.topics = topics self.sparse_bow = sparse_bow self.type_dictionary = type_dictionary
scores = [] N = len(self.topics.T) segmented_topics = self.segment_topics() occurences = self.calculate_occurences(bigrams=segmented_topics) pmi = [] for pair in topic: pmi.append(self.pmi_umass(pair=pair, occurences=occurences, e=e)) if mean: scores.append((2 / (N * (N - 1))) * np.mean(pmi)) else: scores.append((2 / (N * (N - 1))) * np.median(pmi)) return pd.Series(scores)
scores = [] N = len(self.topics.T) segmented_topics = self.segment_topics(permutation=True) occurences = self.calculate_occurences(bigrams=segmented_topics) for topic in segmented_topics: pmi = [] for pair in topic: if mean: scores.append((2 / (N * (N - 1))) * np.mean(pmi)) else: scores.append((2 / (N * (N - 1))) * np.median(pmi)) return pd.Series(scores) |