Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# -*- coding: utf-8 -*-
This module contains functions to calculate topic coherence provided by `DARIAH-DE`_. .. _DARIAH-DE: https://de.dariah.eu https://github.com/DARIAH-DE """
dictionary.index = dictionary[0]
return type_dictionary[token] except KeyError: return type_dictionary[token]
self.sparse_bow = sparse_bow self.type_dictionary = type_dictionary
topic = [token2bow(token, self.type_dictionary) for token in topic[1]] if permutation: else: bigrams.append(list(combinations(topic, 2))) return pd.Series(bigrams)
occurences = pd.Series() if isinstance(bigrams, set): else: keys = set() for topic in bigrams: keys.add(bigram[1]) for key in keys: total = set() if key in doc[1].values: total.add(doc[0]) occurences[str(key)] = total
k1 = occurences[str(pair[0])] k2 = occurences[str(pair[1])] numerator = len(k1k2) + e / n denominator = ((len(k1) + e) / n) * ((len(k2) + e) / n) if normalize: return np.log(numerator / denominator) / -np.log(numerator) else: return np.log(numerator / denominator)
k1 = occurences[str(pair[0])] k2 = occurences[str(pair[1])] k1k2 = k1.intersection(k2) denominator = len(k2) + e / n return np.log(numerator / denominator)
self.sparse_bow = sparse_bow self.type_dictionary = type_dictionary
segmented_topics = self.segment_topics() for topic in segmented_topics: pmi = [] for pair in topic: pmi.append(self.pmi_umass(pair=pair, occurences=occurences, e=e)) if mean: scores.append((2 / (N * (N - 1))) * np.mean(pmi)) else: return pd.Series(scores)
segmented_topics = self.segment_topics(permutation=True) occurences = self.calculate_occurences(bigrams=segmented_topics) for topic in segmented_topics: pmi = [] for pair in topic: pmi.append(self.pmi_uci(pair=pair, occurences=occurences, normalize=normalize, e=e)) if mean: scores.append((2 / (N * (N - 1))) * np.mean(pmi)) else: scores.append((2 / (N * (N - 1))) * np.median(pmi)) return pd.Series(scores) |