Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

""" 

Measuring and Evaluating Semantic Coherence of Topics 

===================================================== 

 

Topic models generate probability distributions over words from a collection of

texts, sorting many single-word distributions into distinct semantic groups

called _topics_. Each topic thus constitutes a group of semantically related

words. This module provides methods to evaluate such topics quantitatively by

their semantic coherence.

""" 

 

from itertools import permutations, combinations 

import numpy as np 

import pandas as pd 

 

 

class Preparation:
    """
    Prepares topic-model output for coherence measures.

    Attributes:
        topics (pd.DataFrame): One row per topic; columns hold the topic keys.
        sparse_bow (pd.DataFrame): Bag-of-words matrix with a MultiIndex of
            `doc_id` and `token_id` and word frequencies as values.
        type_dictionary (dict): Maps types (words) to token IDs.
    """

    def __init__(self, topics, sparse_bow, type_dictionary):
        """
        Creates objects for topics, sparse_bow and type_dictionary.

        Args:
            topics (pd.DataFrame): A DataFrame containing topic keys.
            sparse_bow (pd.DataFrame): A DataFrame containing MultiIndex with
                `doc_id` and `token_id` and word frequencies.
            type_dictionary (dict): A dictionary containing types as keys and
                IDs as values.
        """
        self.topics = topics
        self.sparse_bow = sparse_bow
        self.type_dictionary = type_dictionary

    def segment_topics(self, permutation=False):
        """
        Translates topic keys to their IDs and segments each topic into
        bigrams.

        Args:
            permutation (bool): If True, build ordered pairs (permutations);
                otherwise unordered pairs (combinations). Defaults to False.

        Returns:
            pd.Series: One list of ID bigrams per topic.
        """
        # Ordered pairs are needed for UCI-style measures, unordered pairs
        # for UMass-style measures.
        segment = permutations if permutation else combinations
        bigrams = []
        for _, keys in self.topics.iterrows():
            # token2bow (defined elsewhere in this package) translates a
            # type to its ID via self.type_dictionary.
            ids = [token2bow(token, self.type_dictionary) for token in keys]
            bigrams.append(list(segment(ids, 2)))
        return pd.Series(bigrams)

    def calculate_occurences(self, bigrams):
        """
        Collects, for each token ID appearing in `bigrams`, the set of
        documents containing that ID.

        Args:
            bigrams (pd.Series): Series containing bigrams of combined or
                permuted token IDs.

        Returns:
            pd.Series: Maps each token ID (as str) to a set of document IDs.
        """
        # NOTE(review): a `set` input is silently ignored and yields an empty
        # Series, preserving the original behavior — confirm this is wanted.
        occurences = {}
        if not isinstance(bigrams, set):
            keys = {token for topic in bigrams
                    for pair in topic
                    for token in pair}
            # Single linear pass over the corpus instead of rescanning every
            # document group once per key: map each token ID to the set of
            # documents it occurs in, then look the requested keys up.
            bow = self.sparse_bow.reset_index(level=1)['token_id']
            docs_by_token = {}
            for doc_id, token_id in bow.items():
                docs_by_token.setdefault(token_id, set()).add(doc_id)
            for key in keys:
                occurences[str(key)] = docs_by_token.get(key, set())
        return pd.Series(occurences, dtype=object)

 

 

class Measures(Preparation):
    """
    PMI-based coherence measures (UCI and UMass variants).
    """

    def __init__(self, sparse_bow, type_dictionary):
        """
        Creates objects for sparse_bow and type_dictionary.

        Args:
            sparse_bow (pd.DataFrame): A DataFrame containing MultiIndex with
                `doc_id` and `token_id` and word frequencies.
            type_dictionary (dict): A dictionary containing types as keys and
                IDs as values.
        """
        self.type_dictionary = type_dictionary
        self.sparse_bow = sparse_bow

    def pmi_uci(self, pair, occurences, e=0.1, normalize=False):
        """
        Calculates PMI (UCI) for a token pair. This variant of PMI is based
        on Newman et al. 2010, Automatic Evaluation of Topic Coherence.

        Args:
            pair (tuple): Tuple of two token IDs, e.g. (1, 2).
            occurences (pd.Series): Maps token IDs (as str) to sets of
                document IDs.
            e (float): Smoothing constant to avoid zero division.
            normalize (bool): If True, PMI (UCI) will be normalized.
                Defaults to False.

        Returns:
            float, or None if either token has no occurrence entry.
        """
        # NOTE(review): uses the size of index level 0, which may include
        # unused levels, whereas pmi_umass counts present documents —
        # confirm whether the asymmetry is intended.
        n = len(self.sparse_bow.index.levels[0])
        try:
            k1 = occurences[str(pair[0])]
            k2 = occurences[str(pair[1])]
        except KeyError:
            # A token without occurrence data: no score for this pair.
            return None
        numerator = (len(k1.intersection(k2)) + e) / n
        denominator = ((len(k1) + e) / n) * ((len(k2) + e) / n)
        pmi = np.log(numerator / denominator)
        if normalize:
            return pmi / -np.log(numerator)
        return pmi

    def pmi_umass(self, pair, occurences, e=0.1):
        """
        Calculates PMI (UMass) for a token pair. This variant of PMI is based
        on Mimno et al. 2011, Optimizing Semantic Coherence in Topic Models.

        Args:
            pair (tuple): Tuple of two token IDs, e.g. (1, 2).
            occurences (pd.Series): Maps token IDs (as str) to sets of
                document IDs.
            e (float): Smoothing constant to avoid zero division.

        Returns:
            float, or None if either token has no occurrence entry.
        """
        # Number of distinct documents. The former idiom
        # `len(self.sparse_bow.count(level=0))` relied on the `level`
        # parameter of DataFrame.count, which was removed in pandas 2.0.
        n = self.sparse_bow.index.get_level_values(0).nunique()
        try:
            k1 = occurences[str(pair[0])]
            k2 = occurences[str(pair[1])]
        except KeyError:
            # A token without occurrence data: no score for this pair.
            return None
        numerator = (len(k1.intersection(k2)) + e) / n
        denominator = (len(k2) + e) / n
        return np.log(numerator / denominator)

 

 

class Evaluation(Measures):
    """
    Computes a coherence score per topic for a whole topic model.
    """

    def __init__(self, topics, sparse_bow, type_dictionary):
        """
        Creates objects for topics, sparse_bow and type_dictionary.

        Args:
            topics (pd.DataFrame): A DataFrame containing topic keys.
            sparse_bow (pd.DataFrame): A DataFrame containing MultiIndex with
                `doc_id` and `token_id` and word frequencies.
            type_dictionary (dict): A dictionary containing types as keys and
                IDs as values.
        """
        self.topics = topics
        self.sparse_bow = sparse_bow
        self.type_dictionary = type_dictionary

    def _aggregate_scores(self, segmented_topics, pairwise_pmi, mean):
        """
        Averages pairwise PMI values per topic and applies the 2/(N(N-1))
        normalization, where N is the number of keys per topic.

        Args:
            segmented_topics (pd.Series): One list of ID bigrams per topic.
            pairwise_pmi (callable): Maps one bigram to its PMI (or None).
            mean (bool): If True use the mean, otherwise the median.

        Returns:
            pd.Series: One score per topic.
        """
        n_keys = len(self.topics.T)  # number of keys per topic
        norm = 2 / (n_keys * (n_keys - 1))
        average = np.mean if mean else np.median
        scores = []
        for topic in segmented_topics:
            pmi = [pairwise_pmi(pair) for pair in topic]
            # Pairs with unseen tokens yield None; drop them instead of
            # letting np.mean/np.median fail on a mixed list.
            pmi = [value for value in pmi if value is not None]
            scores.append(norm * average(pmi))
        return pd.Series(scores)

    def calculate_umass(self, mean=True, e=0.1):
        """
        Calculates PMI (UMass) for all topic keys in a DataFrame. This
        variant of PMI is based on Mimno et al. 2011, Optimizing Semantic
        Coherence in Topic Models.

        Args:
            mean (bool): If True, mean will be calculated for each topic,
                if False, median. Defaults to True.
            e (float): Smoothing constant to avoid zero division.

        Returns:
            pd.Series: Score for each topic.
        """
        segmented_topics = self.segment_topics()
        occurences = self.calculate_occurences(bigrams=segmented_topics)
        return self._aggregate_scores(
            segmented_topics,
            lambda pair: self.pmi_umass(pair=pair, occurences=occurences, e=e),
            mean)

    def calculate_uci(self, mean=True, normalize=False, e=0.1):
        """
        Calculates PMI (UCI) for all topic keys in a DataFrame. This variant
        of PMI is based on Newman et al. 2010, Automatic Evaluation of Topic
        Coherence.

        Args:
            mean (bool): If True, mean will be calculated for each topic,
                if False, median. Defaults to True.
            normalize (bool): If True, PMI (UCI) will be normalized.
                Defaults to False.
            e (float): Smoothing constant to avoid zero division.

        Returns:
            pd.Series: Score for each topic.
        """
        segmented_topics = self.segment_topics(permutation=True)
        occurences = self.calculate_occurences(bigrams=segmented_topics)
        return self._aggregate_scores(
            segmented_topics,
            lambda pair: self.pmi_uci(pair=pair, occurences=occurences,
                                      normalize=normalize, e=e),
            mean)