# -*- coding: utf-8 -*-

"""
Measuring and Evaluating Semantic Coherence of Topics
=====================================================

Topic models infer probability distributions over words from a collection of
texts and sort related words into distinct semantic groups called _topics_.
This module provides methods to evaluate such topics quantitatively by their
semantic coherence.
"""

__author__ = "DARIAH-DE"
__email__ = "severin.simmler@stud-mail.uni-wuerzburg.de"

from itertools import permutations, combinations
import numpy as np
import pandas as pd


def read_sparse_bow(path):
    """Reads a sparse_bow from a CSV file.

    Description:
        With this function you can read a CSV file containing a sparse_bow.
        Use the function `create_sparse_bow()` of the `preprocessing` module
        to create a sparse_bow.

    Args:
        path (str): Path to the CSV file.

    Returns:
        DataFrame with a MultiIndex of `doc_id` and `token_id`.
    """
    sparse_bow = pd.read_csv(path)
    sparse_bow = sparse_bow.set_index(['doc_id', 'token_id'])
    return sparse_bow


def read_dictionary(path):
    """Reads a type dictionary from a CSV file.

    Description:
        With this function you can read a CSV file containing a type dictionary.
        Use the function `create_dictionary()` of the `preprocessing` module
        to create a type dictionary.

    Args:
        path (str): Path to the CSV file.

    Returns:
        Dictionary mapping types to IDs.
    """
    dictionary = pd.read_csv(path, header=None)
    dictionary.index = dictionary[0]
    dictionary = dictionary[1]
    return dictionary.to_dict()


def token2bow(token, type_dictionary):
    """Translates a token to its ID, adding it to the type dictionary if necessary.

    Description:
        With this function you can
        * get the ID of a token that is already in the type dictionary
        * add the token to the type dictionary and get its newly assigned ID,
          if the token was not part of the dictionary yet

    Args:
        token (str): Token.
        type_dictionary (dict): Dictionary containing tokens as keys and integers
            as values.

    Returns:
        ID (int) of the token.
    """
    try:
        return type_dictionary[token]
    except KeyError:
        type_dictionary[token] = len(type_dictionary) + 1
        return type_dictionary[token]
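# A minimal illustration (the tokens and IDs are hypothetical): given
# type_dictionary = {'river': 1, 'boat': 2}, token2bow('river', type_dictionary)
# returns 1, while token2bow('water', type_dictionary) adds the new token and
# returns 3, leaving the dictionary as {'river': 1, 'boat': 2, 'water': 3}.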

 

 

class Preparation:
    """
    Preparation for the coherence measures.
    """

    def __init__(self, topics, sparse_bow, type_dictionary):
        """
        Creates objects for topics, sparse_bow and type_dictionary.

        Args:
            topics (pd.DataFrame): A DataFrame containing topic keys.
            sparse_bow (pd.DataFrame): A DataFrame with a MultiIndex of
                `doc_id` and `token_id` and word frequencies.
            type_dictionary (dict): A dictionary containing types as keys and
                IDs as values.
        """
        self.topics = topics
        self.sparse_bow = sparse_bow
        self.type_dictionary = type_dictionary

    def segment_topics(self, permutation=False):
        """
        Translates topic keys to their IDs and combines or permutes them to bigrams.

        Args:
            permutation (bool): True for permutations. If False, topic keys will
                be combined. Defaults to False.

        Returns:
            Series containing bigrams for each topic.
        """
        bigrams = []
        for topic in self.topics.iterrows():
            topic = [token2bow(token, self.type_dictionary)
                     for token in topic[1]]
            if permutation:
                bigrams.append(list(permutations(topic, 2)))
            else:
                bigrams.append(list(combinations(topic, 2)))
        return pd.Series(bigrams)

    def calculate_occurences(self, bigrams):
        """
        Collects for each token ID the set of documents containing that ID.

        Args:
            bigrams (pd.Series): Series containing bigrams of combined or permuted
                token IDs.

        Returns:
            Series containing the document IDs for each token ID.
        """
        bow = self.sparse_bow.reset_index(level=1)['token_id']
        occurences = pd.Series(dtype=object)
        if isinstance(bigrams, set):
            pass
        else:
            keys = set()
            for topic in bigrams:
                for bigram in topic:
                    keys.add(bigram[0])
                    keys.add(bigram[1])
            for key in keys:
                total = set()
                for doc in bow.groupby(level=0):
                    if key in doc[1].values:
                        total.add(doc[0])
                occurences[str(key)] = total
        return occurences
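# A minimal sketch of the preparation step (topic keys and IDs are hypothetical,
# assuming an initially empty type dictionary): for a topics DataFrame whose
# first row holds the keys ['river', 'boat', 'water'], `segment_topics()` yields
# the combinations [(1, 2), (1, 3), (2, 3)]; with `permutation=True` the reversed
# pairs are included as well. `calculate_occurences()` then records, for every ID
# occurring in a bigram, the set of `doc_id`s of the documents that contain it.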

 

 

class Measures(Preparation):
    """
    Contains the PMI measures.
    """

    def __init__(self, sparse_bow, type_dictionary):
        """
        Creates objects for sparse_bow and type_dictionary.

        Args:
            sparse_bow (pd.DataFrame): A DataFrame with a MultiIndex of
                `doc_id` and `token_id` and word frequencies.
            type_dictionary (dict): A dictionary containing types as keys and
                IDs as values.
        """
        self.type_dictionary = type_dictionary
        self.sparse_bow = sparse_bow

    def pmi_uci(self, pair, occurences, e=0.1, normalize=False):
        """
        Calculates PMI (UCI) for a token pair. This variant of PMI is based on
        Newman et al. 2010, "Automatic Evaluation of Topic Coherence".

        Args:
            pair (tuple): Tuple containing two tokens, e.g. ('token1', 'token2').
            occurences (pd.Series): Series containing document occurrences.
            e (float): Small constant to avoid division by zero.
            normalize (bool): If True, PMI (UCI) will be normalized. Defaults to
                False.

        Returns:
            Float.
        """
        n = len(self.sparse_bow.index.levels[0])
        try:
            k1 = occurences[str(pair[0])]
        except KeyError:
            print("%s not found" % pair[0])
        try:
            k2 = occurences[str(pair[1])]
        except KeyError:
            print("%s not found" % pair[1])
        try:
            k1k2 = k1.intersection(k2)
            numerator = (len(k1k2) + e) / n
            denominator = ((len(k1) + e) / n) * ((len(k2) + e) / n)
            if normalize:
                return np.log(numerator / denominator) / -np.log(numerator)
            else:
                return np.log(numerator / denominator)
        except UnboundLocalError:
            pass

    def pmi_umass(self, pair, occurences, e=0.1):
        """
        Calculates PMI (UMass) for a token pair. This variant of PMI is based on
        Mimno et al. 2011, "Optimizing Semantic Coherence in Topic Models".

        Args:
            pair (tuple): Tuple containing two tokens, e.g. ('token1', 'token2').
            occurences (pd.Series): Series containing document occurrences.
            e (float): Small constant to avoid division by zero.

        Returns:
            Float.
        """
        n = len(self.sparse_bow.groupby(level=0))  # number of documents
        try:
            k1 = occurences[str(pair[0])]
        except KeyError:
            print("%s not found" % pair[0])
        try:
            k2 = occurences[str(pair[1])]
        except KeyError:
            print("%s not found" % pair[1])
        try:
            k1k2 = k1.intersection(k2)
            numerator = (len(k1k2) + e) / n
            denominator = (len(k2) + e) / n
            return np.log(numerator / denominator)
        except UnboundLocalError:
            pass
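# A rough worked example with hypothetical counts: for n = 100 documents,
# |D(w1)| = 20, |D(w2)| = 10, |D(w1, w2)| = 5 joint occurrences and e = 0.1,
# PMI (UCI) = log((5.1/100) / ((20.1/100) * (10.1/100))) = log(2.51) ≈ 0.92,
# whereas PMI (UMass) = log((5.1/100) / (10.1/100)) = log(0.505) ≈ -0.68
# (natural logarithm, as computed by np.log above).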

 

 

class Evaluation(Measures):
    def __init__(self, topics, sparse_bow, type_dictionary):
        """
        Creates objects for topics, sparse_bow and type_dictionary.

        Args:
            topics (pd.DataFrame): A DataFrame containing topic keys.
            sparse_bow (pd.DataFrame): A DataFrame with a MultiIndex of
                `doc_id` and `token_id` and word frequencies.
            type_dictionary (dict): A dictionary containing types as keys and
                IDs as values.
        """
        self.topics = topics
        self.sparse_bow = sparse_bow
        self.type_dictionary = type_dictionary

    def calculate_umass(self, mean=True, e=0.1):
        """
        Calculates PMI (UMass) for all topic keys in a DataFrame. This variant of
        PMI is based on Mimno et al. 2011, "Optimizing Semantic Coherence in
        Topic Models".

        Args:
            mean (bool): If True, the mean will be calculated for each topic,
                if False, the median. Defaults to True.
            e (float): Small constant to avoid division by zero.

        Returns:
            Series with one score per topic.
        """
        scores = []
        N = len(self.topics.T)  # number of keys per topic
        segmented_topics = self.segment_topics()
        occurences = self.calculate_occurences(bigrams=segmented_topics)
        for topic in segmented_topics:
            pmi = []
            for pair in topic:
                pmi.append(self.pmi_umass(
                    pair=pair, occurences=occurences, e=e))
            if mean:
                scores.append((2 / (N * (N - 1))) * np.mean(pmi))
            else:
                scores.append((2 / (N * (N - 1))) * np.median(pmi))
        return pd.Series(scores)

    def calculate_uci(self, mean=True, normalize=False, e=0.1):
        """
        Calculates PMI (UCI) for all topic keys in a DataFrame. This variant of
        PMI is based on Newman et al. 2010, "Automatic Evaluation of Topic
        Coherence".

        Args:
            mean (bool): If True, the mean will be calculated for each topic,
                if False, the median. Defaults to True.
            normalize (bool): If True, PMI (UCI) will be normalized. Defaults to
                False.
            e (float): Small constant to avoid division by zero.

        Returns:
            Series with one score per topic.
        """
        scores = []
        N = len(self.topics.T)  # number of keys per topic
        segmented_topics = self.segment_topics(permutation=True)
        occurences = self.calculate_occurences(bigrams=segmented_topics)
        for topic in segmented_topics:
            pmi = []
            for pair in topic:
                pmi.append(self.pmi_uci(
                    pair=pair, occurences=occurences, normalize=normalize, e=e))
            if mean:
                scores.append((2 / (N * (N - 1))) * np.mean(pmi))
            else:
                scores.append((2 / (N * (N - 1))) * np.median(pmi))
        return pd.Series(scores)
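

# A minimal usage sketch with toy data; every name and value below is
# hypothetical and only meant to show how the classes fit together:
if __name__ == '__main__':
    topics = pd.DataFrame([['river', 'boat', 'water'],
                           ['letter', 'night', 'soul']])
    sparse_bow = pd.DataFrame({'doc_id': [1, 1, 1, 2, 2, 2],
                               'token_id': [1, 2, 3, 4, 5, 6],
                               'frequency': [2, 1, 1, 3, 1, 1]})
    sparse_bow = sparse_bow.set_index(['doc_id', 'token_id'])
    type_dictionary = {'river': 1, 'boat': 2, 'water': 3,
                       'letter': 4, 'night': 5, 'soul': 6}

    evaluation = Evaluation(topics, sparse_bow, type_dictionary)
    print(evaluation.calculate_umass())  # one UMass coherence score per topic
    print(evaluation.calculate_uci())    # one UCI coherence score per topic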