# (coverage-report viewer residue — not part of the module itself)
# Hot-keys on this page
# r m x p toggle line displays
# j k next/prev highlighted chunk
# 0 (zero) top of page
# 1 (one) first highlighted chunk
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""This module contains functions to calculate topic coherence provided by `DARIAH-DE`_.

.. _DARIAH-DE:
    https://de.dariah.eu
    https://github.com/DARIAH-DE
"""
return sparse_bow
dictionary = pd.read_csv(path, header=None) return dictionary.to_dict()
try: return type_dictionary[token] type_dictionary[token] = len(type_dictionary) + 1 return type_dictionary[token]
self.topics = topics self.sparse_bow = sparse_bow self.type_dictionary = type_dictionary
for topic in self.topics.iterrows(): topic = [token2bow(token, self.type_dictionary) for token in topic[1]] bigrams.append(list(permutations(topic, 2))) else: bigrams.append(list(combinations(topic, 2))) return pd.Series(bigrams)
bow = self.sparse_bow.reset_index(level=1)['token_id'] occurences = pd.Series() pass else: keys = set() keys.add(bigram[0]) keys.add(bigram[1]) print(key) for doc in bow.groupby(level=0): if key in doc[1].values: total.add(doc[0]) return occurences
n = len(self.sparse_bow.index.levels[0]) k1 = occurences[str(pair[0])] k1k2 = k1.intersection(k2) numerator = len(k1k2) + e / n denominator = ((len(k1) + e) / n) * ((len(k2) + e) / n) if normalize: else:
n = len(self.sparse_bow.count(level=0)) k1 = occurences[str(pair[0])] k2 = occurences[str(pair[1])] numerator = len(k1k2) + e / n denominator = len(k2) + e / n return np.log(numerator / denominator)
self.topics = topics self.sparse_bow = sparse_bow self.N = len(self.topics.T)
scores = [] segmented_topics = self.segment_topics() occurences = self.calculate_occurences(bigrams=segmented_topics) for topic in segmented_topics: pmi = [] for pair in topic: pmi.append(self.pmi_umass(pair=pair, occurences=occurences, e=e)) if mean: else: scores.append((2 / (N * (N - 1))) * np.median(pmi)) return pd.Series(scores)
scores = [] N = len(self.topics.T) segmented_topics = self.segment_topics(permutation=True) occurences = self.calculate_occurences(bigrams=segmented_topics) for topic in segmented_topics: pmi = [] for pair in topic: pmi.append(self.pmi_uci(pair=pair, occurences=occurences, normalize=normalize, e=e)) if mean: scores.append((2 / (N * (N - 1))) * np.mean(pmi)) else: scores.append((2 / (N * (N - 1))) * np.median(pmi)) return pd.Series(scores) |