Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

# -*- coding: utf-8 -*- 

 

""" 

Visualizing the Output of LDA Models 

==================================== 

 

 

""" 

 

__author__ = "DARIAH-DE" 

__authors__ = "Steffen Pielstroem, Sina Bock, Severin Simmler" 

__email__ = "pielstroem@biozentrum.uni-wuerzburg.de" 

__version__ = "0.1" 

__date__ = "2017-01-20" 

 

 

import logging 

import matplotlib 

matplotlib.use('Agg') 

import matplotlib.pyplot as plt 

import numpy as np 

import os 

import pandas as pd 

import regex 

from collections import defaultdict 

 

# Module-wide logger used by every function in this file. The NullHandler
# keeps the logger quiet unless the application configures logging itself.
log = logging.getLogger('visualization')
log.addHandler(logging.NullHandler())

# Root-logger defaults: only errors are shown, prefixed with level and
# logger name. basicConfig() does nothing if the root logger is already set up.
logging.basicConfig(
    format='%(levelname)s %(name)s: %(message)s',
    level=logging.ERROR,
)

 

def create_doc_topic(corpus, model, doc_labels):
    # Adapted from code by Stefan Pernes
    """Creates a document-topic-matrix.

    Description:
        With this function you can create a doc-topic-matrix for gensim
        output. The matrix has one row per topic and one column per
        document label. Each cell holds the weight the model reports for
        that topic in that document; topics the model does not report for
        a document stay at 0.

    Args:
        corpus (mmCorpus): Gensim corpus. Documents are matched to
            ``doc_labels`` by position.
        model: Gensim LDA model. Must provide ``num_topics``, item access
            (``model[doc]`` yielding ``(topic_id, weight)`` pairs) and
            ``show_topic(topic_id, topn=...)``.
        doc_labels (list): List of document labels.

    Returns:
        Doc_topic-matrix as DataFrame. The index holds the topic labels
        (the first element of each of the three ``show_topic()`` entries,
        joined by spaces), the columns hold ``doc_labels``.

    Example:
        >>> import gensim
        >>> corpus = [[(1, 0.5)], []]
        >>> gensim.corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
        >>> mm = gensim.corpora.MmCorpus('/tmp/corpus.mm')
        >>> type2id = {0 : "test", 1 : "corpus"}
        >>> doc_labels = ['doc1', 'doc2']
        >>> model = gensim.models.LdaModel(corpus=mm, id2word=type2id, num_topics=1)
        >>> doc_topic = visualization.create_doc_topic(corpus, model, doc_labels)
        >>> len(doc_topic.T) == 2
        True
    """
    log.info("Generating doc_topic_matrix ... ")
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_topics, no_of_docs))

    # zip() stops at the shorter input, so corpus documents beyond the last
    # label are ignored instead of indexing past the matrix.
    for i, doc in zip(range(no_of_docs), corpus):
        # model[doc] gives the topic distribution of the document's
        # bag-of-words as (topic_id, weight) pairs.
        for topic_id, weight in model[doc]:
            doc_topic[topic_id, i] = weight

    # Label every topic with its three top entries. The first element of each
    # show_topic() entry has to be the term string, otherwise join() fails.
    topic_labels = [
        " ".join(entry[0] for entry in model.show_topic(i, topn=3))
        for i in range(no_of_topics)
    ]

    return pd.DataFrame(doc_topic, index=topic_labels, columns=doc_labels)

 

def doc_topic_heatmap(data_frame):
    # Adapted from code by Stefan Pernes and Allen Riddell
    """Plot document-topic distribution in a heat map.

    Description:
        Use create_doc_topic() to generate a doc-topic matrix and pass it
        to this function. The frame is sorted by its index; the index
        entries label the y axis (top to bottom) and the column names
        label the x axis. The figure is enlarged to 10x10 inches when
        either axis has more than 20 entries; otherwise the current
        figure is used.

    Args:
        data_frame (DataFrame): Document-topic-matrix.

    Returns:
        The ``matplotlib.pyplot`` module holding the heat map.

    Example:
        >>> import gensim
        >>> corpus = [[(1, 0.5)], []]
        >>> gensim.corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
        >>> mm = gensim.corpora.MmCorpus('/tmp/corpus.mm')
        >>> type2id = {0 : "test", 1 : "corpus"}
        >>> doc_labels = ['doc1', 'doc2']
        >>> model = gensim.models.LdaModel(corpus=mm, id2word=type2id, num_topics=1)
        >>> doc_topic = visualization.create_doc_topic(corpus, model, doc_labels)
        >>> plot = visualization.doc_topic_heatmap(doc_topic)
        >>> plot.get_fignums()
        [1]
    """
    log.info("Generating doc_topic_heatmap ... ")
    data_frame = data_frame.sort_index()
    row_labels = list(data_frame.index)
    column_labels = list(data_frame)

    # if many items, enlarge figure
    if len(row_labels) > 20 or len(column_labels) > 20:
        plt.figure(figsize=(10, 10))

    plt.pcolor(data_frame, norm=None, cmap='Reds')
    # NOTE(review): the y ticks are offset by 1.0 while the x ticks use the
    # cell centre (0.5) -- kept as in the original, confirm this is intended.
    plt.yticks(np.arange(data_frame.shape[0]) + 1.0, row_labels)
    # rotation must be a number: recent matplotlib rejects the string '90'.
    plt.xticks(np.arange(data_frame.shape[1]) + 0.5, column_labels, rotation=90)
    plt.gca().invert_yaxis()
    plt.tight_layout()

    return plt

 

 

def plot_doc_topics(doc_topic, document_index):
    """Plot topic distribution in a document.

    Description:
        Draws a horizontal bar chart of the non-zero topic proportions of
        one document, sorted in ascending order, onto the current
        matplotlib figure. The document label is used as the plot title.

    Args:
        doc_topic (DataFrame): Document-topic data frame with one column
            per document and one row per topic.
        document_index (int): Positional index of the document (column)
            to be shown.

    Returns:
        The ``matplotlib.pyplot`` module holding the plot.

    Example:
        >>> import gensim
        >>> corpus = [[(1, 0.5)], []]
        >>> gensim.corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
        >>> mm = gensim.corpora.MmCorpus('/tmp/corpus.mm')
        >>> type2id = {0 : "test", 1 : "corpus"}
        >>> doc_labels = ['doc1', 'doc2']
        >>> model = gensim.models.LdaModel(corpus=mm, id2word=type2id, num_topics=1)
        >>> doc_topic = visualization.create_doc_topic(corpus, model, doc_labels)
        >>> plot = visualization.plot_doc_topics(doc_topic, 0)
        >>> plot.get_fignums()
        [1]
    """
    log.info("Calculating topic distribution ... ")
    # Select the column by position: a lookup by label would return several
    # columns (and break the sort below) when document labels repeat.
    data = doc_topic.iloc[:, document_index]
    data = data[data != 0].sort_values()
    values = list(data)
    labels = list(data.index)
    positions = range(len(values))

    plt.barh(positions, values, align='center', alpha=0.5)
    plt.yticks(positions, labels)
    plt.title(doc_topic.columns[document_index])
    plt.xlabel('Proportion')
    plt.ylabel('Topic')
    plt.tight_layout()
    return plt

 

 

try:
    from wordcloud import WordCloud

    #
    # Work in progress following
    #
    def topicwords_in_df(model):
        """Read Keywords for each topic

        Description:
            Extracts the keywords from the formatted topic strings returned
            by ``model.show_topics()`` and arranges them in a table with one
            row per returned topic and one column per keyword position.
            Only runs of at least two letters (optionally with one
            punctuation character in between) are recognised as keywords.

        Args:
            model: Gensim LDA model

        Note: Work in progress

        ToDo:
            Check if this function should be implemented
            and complete docstring

        Returns:
            Pandas DataFrame with rows "Topic 1", "Topic 2", ... (numbered
            by position in the ``show_topics()`` output) and columns
            "Key 1", "Key 2", ...
        """
        pattern = regex.compile(r'\p{L}+\p{P}?\p{L}+')
        topics = []
        index = []
        log.info("Get keywords for topic ...")
        for n, topic in enumerate(model.show_topics()):
            topics.append(pattern.findall(topic[1]))
            index.append("Topic " + str(n+1))
        # The columns enumerate keyword positions, so their number must follow
        # the longest keyword list, not the number of topics.
        no_of_keys = max([len(keys) for keys in topics] or [0])
        columns = ["Key " + str(x+1) for x in range(no_of_keys)]
        return pd.DataFrame(topics, index=index, columns=columns)

    def show_wordle_for_topic(model, topic_nr, words):
        """Plot wordle for a specific topic

        Args:
            model: Gensim LDA model
            topic_nr(int): Choose topic
            words (int): Number of words to show

        Note:
            Work in progress
            Function does use wordcloud package -> https://pypi.python.org/pypi/wordcloud
            pip install wordcloud.

        ToDo:
            Check if this function should be implemented.

        Returns:
            The ``matplotlib.pyplot`` module holding the plot (drawn on a
            new figure titled "Topic #<topic_nr + 1>").
        """
        plt.figure()
        plt.imshow(WordCloud().fit_words(dict(model.show_topic(topic_nr, words))))
        plt.axis("off")
        plt.title("Topic #" + str(topic_nr + 1))
        return plt

    def get_color_scale(word, font_size, position, orientation, font_path, random_state=None):
        """ Create color scheme for wordle.

        Description:
            Returns the same colour for every word; all arguments are
            ignored.

        Args:
            word, font_size, position, orientation, font_path, random_state:
                Unused. NOTE(review): presumably the callback signature
                expected by the wordcloud package's ``color_func`` --
                confirm against the caller.

        Note:
            Work in progress

        ToDo:
            Check if this function should be implemented.

        Returns:
            HSL colour string.
        """
        # Alternatives that were tried (both need ``random``):
        #   "hsl(0, 00%, %d%%)" % random.randint(80, 100)    greys for black background
        #   "hsl(221, 65%%, %d%%)" % random.randint(30, 35)  dark blues for white background
        return "hsl(245, 58%, 25%)" # Default. Uniform dark blue.

    def get_topicRank(topic, topicRanksFile):
        """ Add ranking to topics

        Description:
            Uses topicRanksFile to add ranking to topics. The file is read
            as comma-separated values with the first column as index; the
            rank is taken from the "Rank" column of the row at position
            ``topic``.

        Args:
            topic(int): Number of topic.
            topicRanksFile(str): Path to topicRanksFile

        Returns:
            Rank of chosen topic (int)

        Note:
            Work in progress

        ToDo:
            Check if this function should be implemented.
        """
        assert topicRanksFile

        log.info("Add ranking ...")
        with open(topicRanksFile, "r") as infile:
            topicRanks = pd.read_csv(infile, sep=",", index_col=0)
            rank = int(topicRanks.iloc[topic]["Rank"])
            return rank

    def read_mallet_word_weights(word_weights_file):
        """Read Mallet word_weights file

        Description:
            Reads Mallet word_weights (tab-separated, no header row) into a
            pandas DataFrame, sorts it ascending by column 0 and descending
            by column 2, and groups it by column 0.
            NOTE(review): columns 0/1/2 are presumably topic number, word
            and weight -- confirm against the Mallet output.

        Args:
            word_weights_file: Word_weights_file created with Mallet

        Returns:
            Pandas DataFrame grouped by its first column (GroupBy object)

        Note:
            Work in progress
        """
        assert word_weights_file
        log.info("Get word weights ...")
        word_scores = pd.read_table(word_weights_file, header=None, sep="\t")
        # DataFrame.sort() was removed from pandas; sort_values() is the replacement.
        word_scores = word_scores.sort_values(by=[0, 2], axis=0, ascending=[True, False])
        word_scores_grouped = word_scores.groupby(0)
        return word_scores_grouped

    def _get_wordcloudwords(word_scores_grouped, number_of_top_words, topic_nr):
        """Transform Mallet output for wordcloud generation.

        Description:
            Get words for wordcloud: takes the first ``number_of_top_words``
            rows of the group ``topic_nr`` and concatenates the entries of
            column 1, each followed by a single space.

        Args:
            word_scores_grouped(DataFrame): Uses read_mallet_word_weights() to get
                grouped word scores.
            number_of_top_words(int): Number of top words that should be considered
            topic_nr(int): Topic the wordcloud should be generated for

        Returns:
            Words for wordcloud (str).

        Note:
            Work in progress
        """
        log.info("Transform mallet output for wordcloud ...")
        topic_word_scores = word_scores_grouped.get_group(topic_nr)
        top_topic_word_scores = topic_word_scores.iloc[0:number_of_top_words]
        topic_words = top_topic_word_scores.loc[:, 1].tolist()
        # Every word keeps its trailing space, including the last one.
        return "".join(word + " " for word in topic_words)

    def plot_wordcloud_from_mallet(word_weights_file,
                                   topic_nr,
                                   number_of_top_words,
                                   outfolder,
                                   dpi):
        """Generate wordclouds from Mallet output.

        Description:
            This function does use the wordcloud module to plot wordclouds.
            Uses read_mallet_word_weights() and _get_wordcloudwords() to get
            word_scores and words for wordclouds. The image is saved as
            ``wordcloud_tp<topic_nr, three digits>.png`` inside ``outfolder``,
            which is created if it does not exist.

        Args:
            word_weights_file: Word_weights_file created with Mallet
            topic_nr(int): Topic the wordclouds should be generated for
            number_of_top_words(int): Number of top words that should be considered
                for the wordclouds
            outfolder(str): Specify path to save wordclouds.
            dpi(int): Set resolution for wordclouds.

        Returns:
            The ``matplotlib.pyplot`` module holding the plot.

        Note:
            Work in progress
        """
        assert word_weights_file
        log.info("Generate wordcloud...")
        word_scores_grouped = read_mallet_word_weights(word_weights_file)
        text = _get_wordcloudwords(word_scores_grouped, number_of_top_words, topic_nr)
        wordcloud = WordCloud(width=600, height=400, background_color="white", margin=4).generate(text)
        figure_title = "topic "+ str(topic_nr)
        # Draw the rendered image once (it used to be drawn twice on top of itself).
        plt.imshow(wordcloud.to_array())
        plt.title(figure_title, fontsize=30)
        plt.axis("off")

        ## Saving the image file.
        if not os.path.exists(outfolder):
            os.makedirs(outfolder)

        figure_filename = "wordcloud_tp"+"{:03d}".format(topic_nr) + ".png"
        # os.path.join() adds the separator that plain concatenation dropped
        # whenever outfolder had no trailing slash.
        plt.savefig(os.path.join(outfolder, figure_filename), dpi=dpi)
        return plt

    def plot_wordle_from_lda(model, vocab, topic_nr, words, height, width):
        """ Plot wordle for a topic of a fitted LDA model.

        Description:
            Picks the ``words`` highest-weighted vocabulary entries of the
            topic and builds a word cloud in which every token is paired
            with its own weight.

        Args:
            model: LDA model exposing a ``topic_word_`` matrix.
                NOTE(review): ``topic_word_`` is presumably the attribute of
                the ``lda`` package rather than of Gensim -- confirm which
                model type callers pass.
            vocab: Sequence of tokens, aligned with the columns of
                ``model.topic_word_``.
            topic_nr(int): Topic a wordcloud should be generated for.
            words(int): Number of top words to include.
            height(int): Height of the plot
            width(int): Width of the plot

        Note:
            Work in progress

        ToDo:
            Check if this function should be implemented.

        Returns:
            WordCloud object fitted to the selected words.
        """
        log.info("Generate wordcloud...")
        topic_dist = np.asarray(model.topic_word_[topic_nr])
        # Positions of the top-weighted entries, best first. The same index is
        # used for tokens and weights so that they stay aligned.
        top = np.argsort(topic_dist)[::-1][:words]
        token_value = dict(zip(np.asarray(vocab)[top], topic_dist[top]))
        return WordCloud(background_color='white', height=height, width=width).fit_words(token_value)

except ImportError:
    log.error('WordCloud functions not available, they require the wordcloud module')

 

 

def doc_topic_heatmap_interactive(doc_topic, title):
    """Plot interactive doc_topic_heatmap

    Description:
        With this function you can plot an interactive doc_topic matrix.
        Documents are placed on the x axis, topics on the y axis (in
        reversed index order), and each cell is coloured by its score.
        Calls Bokeh's ``output_notebook()`` before building the figure.
        If Bokeh cannot be imported, the static matplotlib heat map from
        doc_topic_heatmap() is produced instead.

    Args:
        doc_topic (DataFrame): Doc_topic matrix in a DataFrame (index:
            topics, columns: documents).
        title (str): Title shown in the plot.

    Returns:
        bokeh plot, or the return value of doc_topic_heatmap() when Bokeh
        is not available.

    ToDo:
        Doctest
    """
    log.info("Importing functions from bokeh ...")
    # Only the imports are guarded: errors raised while building the plot must
    # surface as they are instead of being reported as a missing Bokeh.
    try:
        #from ipywidgets import interact
        from bokeh.io import output_notebook
        from bokeh.plotting import figure
        from bokeh.models import (
            ColumnDataSource,
            HoverTool,
            LinearColorMapper,
            BasicTicker,
            ColorBar
        )
    except ImportError:
        log.info("Bokeh could not be imported now using mathplotlib")
        return doc_topic_heatmap(doc_topic)

    from math import pi

    output_notebook()

    documents = list(doc_topic.columns)
    topics = list(doc_topic.index)

    # Long format with one row per (document, topic) cell, document by
    # document. order='F' walks the matrix column by column, which matches
    # the order of the 'Topic' and 'Document' lists.
    df = pd.DataFrame({
        'Topic': topics * len(documents),
        'Document': [document for document in documents for _ in topics],
        'Score': doc_topic.values.ravel(order='F').tolist()
    })

    colors = ["#c6dbef", "#9ecae1", "#6baed6", "#4292c6", "#2171b5", "#08519c", "#08306b"]
    mapper = LinearColorMapper(palette=colors, low=df['Score'].min(), high=df['Score'].max())

    source = ColumnDataSource(df)

    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

    # NOTE(review): plot_width/plot_height/responsive are passed through
    # unchanged; whether they are accepted depends on the installed Bokeh
    # version -- confirm.
    p = figure(title=title,
               x_range=documents, y_range=list(reversed(topics)),
               x_axis_location="above", plot_width=1024, plot_height=768,
               tools=TOOLS, toolbar_location='below', responsive=True)

    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "9pt"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = pi / 3

    p.rect(x="Document", y="Topic", width=1, height=1,
           source=source,
           fill_color={'field': 'Score', 'transform': mapper},
           line_color=None)

    color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="10pt",
                         ticker=BasicTicker(desired_num_ticks=len(colors)),
                         label_standoff=6, border_line_color=None, location=(0, 0))

    p.add_layout(color_bar, 'right')

    p.select_one(HoverTool).tooltips = [
        ('Document', '@Document'),
        ('Topic', '@Topic'),
        ('Score', '@Score')
    ]
    return p

 

 

 

def show_topic_over_time(doc_topics, pattern = r"\d{4}", threshold=0.1, starttime=1841, endtime=1920):
    """Creates a visualization that shows topics over time

    Description:
        With this function you can plot topics over time using metadata stored in the documents name.
        For every topic (row of ``doc_topics``) one line is drawn: per year,
        the number of documents whose score for that topic is above
        ``threshold``. The year of a document is the first match of
        ``pattern`` in its name; documents without a match are skipped
        with a warning.
        Only works with mallet output.

    Args:
        doc_topics: doc-topic matrix created by mallet.show_doc_topic_matrix
            (index: topic labels, columns: document names).
        pattern(str): regular expression that extracts the year from a
            document name
        threshold(float): threshold set to define if a topic in a document is viable
        starttime(int): sets starting point for visualization
        endtime(int): sets ending point for visualization (exclusive)

    Returns:
        matplotlib figure

    Note: this function is created for a corpus with filenames that looks like:
        1866_ArticleName.txt

    ToDo: make it compatible with gensim output
        Doctest
    """
    years = list(range(starttime, endtime))
    # Compile once: the pattern is the same for every topic and document.
    year_pattern = regex.compile(pattern)

    # Iterate the topic labels directly. The intermediate ``topiclabels`` list
    # was never filled, so no topic was ever plotted.
    for topiclabel in doc_topics.index.values:
        topic_scores = doc_topics.loc[topiclabel]
        over_threshold = topic_scores.loc[topic_scores > threshold]

        docs_per_year = defaultdict(int)
        for doc_name in over_threshold.index.values:
            found = year_pattern.findall(doc_name)
            if not found:
                log.warning("No match for pattern %r in document name %r, skipping",
                            pattern, doc_name)
                continue
            docs_per_year[found[0]] += 1

        topic_over_threshold_per_year = [docs_per_year.get(str(year), 0) for year in years]
        plt.plot(years, topic_over_threshold_per_year, label=topiclabel)

    plt.xlabel('Year')
    plt.ylabel('count topics over threshold')
    plt.legend()
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.show()
    return fig