Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

#!/usr/bin/env python3 

# -*- coding: utf-8 -*- 

 

"""MALLET wrapper for Python. 

 

This module contains various `MALLET`_ related functions for Topic Modeling 

provided by `DARIAH-DE`_. 

 

.. _MALLET: 

http://mallet.cs.umass.edu/ 

.. _DARIAH-DE: 

https://de.dariah.eu 

https://github.com/DARIAH-DE 

""" 

 

__author__ = "DARIAH-DE" 

__authors__ = "Steffen Pielstroem, Sina Bock, Severin Simmler" 

__email__ = "pielstroem@biozentrum.uni-wuerzburg.de" 

 

import itertools 

import logging 

import numpy as np 

import operator 

import os 

import pandas as pd 

from platform import system 

from subprocess import Popen, PIPE 

 

log = logging.getLogger('mallet') 

log.addHandler(logging.NullHandler()) 

logging.basicConfig(level=logging.INFO, 

format='%(levelname)s %(name)s: %(message)s') 

 

 

def create_mallet_binary(path_to_mallet='mallet', path_to_file=False, 

path_to_corpus=False, output_file='mallet_output/binary.mallet', 

encoding=None, token_regex=None, preserve_case=False, 

remove_stopwords=True, stoplist=None, extra_stopwords=None, 

stop_pattern_file=None, skip_header=False, skip_html=False, 

replacement_files=None, deletion_files=None, gram_sizes=None, 

keep_sequence=True, keep_sequence_bigrams=False, 

binary_features=False, save_text_in_source=False, 

print_output=False): 

"""Creates a MALLET binary file. 

 

Description: 

bla 

 

Args: 

path_to_mallet (str): Path to MALLET. Defaults to 'mallet'. If MALLET is 

not properly installed, use absolute path, e.g. '/home/workspace/mallet/bin/mallet'. 

path_to_file (str): Absolute path to text file, e.g. '/home/workspace/testfile.txt'. 

path_to_corpus (str): Absolute path to corpus folder, e.g. '/home/workspace/corpus_txt'. 

output_file (str): Path to output plus filename, e.g. '/home/workspace/mallet_output/binary.mallet'. 

encoding (str): Character encoding for input file. Defaults to UTF-8. 

token_regex (str): Divides documents into tokens using a regular 

expression (supports Unicode regex). Defaults to \p{L}[\p{L}\p{P}]+\p{L}. 

preserve_case (bool): If false, converts all word features to lowercase. Defaults to False. 

remove_stopwords (bool): Ignores a standard list of very common English tokens. Defaults to True. 

stoplist (str): Absolute path to plain text stopword list. Defaults to None. 

extra_stopwords (str): Read whitespace-separated words from this file, 

and add them to either the default English stoplist or the list 

specified by `stoplist`. Defaults to None. 

stop_pattern_file (str): Read regular expressions from a file, one per 

line. Tokens matching these regexps will be removed. Defaults to None. 

skip_header (bool): If true, in each document, remove text occurring 

before a blank line. This is useful for removing email or UseNet 

headers. Defaults to False. 

skip_html (bool): If true, remove text occurring inside <...>, as in 

HTML or SGML. Defaults to False. 

replacement_files (str): Files containing string replacements, one per 

line: 'A B [tab] C' replaces A B with C, 'A B' replaces A B with A_B. 

Defaults to None. 

deletion_files (str): Files containing strings to delete after 

`replacements_files` but before tokenization (i.e. multiword stop 

terms). Defaults to False. 

gram_sizes (int): Include among the features all n-grams of sizes 

specified. For example, to get all unigrams and bigrams, use `gram_sizes=1,2`. 

This option occurs after the removal of stop words, if removed. 

Defaults to None. 

keep_sequence (bool): Preserves the document as a sequence of word features, 

rather than a vector of word feature counts. Use this option for sequence 

labeling tasks. MALLET also requires feature sequences rather than 

feature vectors. Defaults to True. 

keep_sequence_bigrams (bool): If true, final data will be a 

FeatureSequenceWithBigrams rather than a FeatureVector. Defaults to False. 

binary_features (bool): If true, features will be binary. Defaults to False. 

save_text_in_source (bool): If true, save original text of document in source. 

Defaults to False. 

print_output (bool): If true, print a representation of the processed data 

to standard output. This option is intended for debugging. Defaults to 

False. 

 

Returns: 

String. Absolute path to created MALLET binary file. 

""" 

if system() == 'Windows': 

shell = True 

else: 

shell = False 

 

if not os.path.exists(os.path.dirname(output_file)): 

os.makedirs(os.path.dirname(output_file)) 

 

param = [path_to_mallet] 

if not path_to_file: 

param.append('import-dir') 

param.append('--input') 

param.append(path_to_corpus) 

else: 

param.append('import-file') 

param.append('--input') 

param.append(path_to_file) 

if encoding is not None: 

param.append('--encoding') 

param.append(encoding) 

if token_regex is not None: 

param.append('--token-regex') 

param.append(token_regex) 

if preserve_case: 

param.append('--preserve-case') 

if remove_stopwords: 

param.append('--remove-stopwords') 

if stoplist is not None: 

param.append('--stoplist-file') 

param.append(stoplist) 

if extra_stopwords is not None: 

param.append('--extra-stopwords') 

param.append(extra_stopwords) 

if stop_pattern_file is not None: 

param.append('--stop-pattern-file') 

param.append(stop_pattern_file) 

if skip_header: 

param.append('--skip-header') 

if skip_html: 

param.append('--skip-html') 

if replacement_files is not None: 

param.append('--replacement-files') 

param.append(replacement_files) 

if deletion_files is not None: 

param.append('--deletion-files') 

param.append(deletion_files) 

if gram_sizes is not None: 

param.append('--gram-sizes') 

param.append(str(gram_sizes)) 

if keep_sequence: 

param.append('--keep-sequence') 

if keep_sequence_bigrams: 

param.append('--keep-sequence-bigrams') 

if binary_features: 

param.append('--binary-features') 

if save_text_in_source: 

param.append('--save-text-in-source') 

if print_output: 

param.append('--print-output') 

param.append('--output') 

param.append(output_file) 

 

try: 

log.info("Running MALLET with %s ...", param) 

log.info("Saving MALLET binary to %s ...", output_file) 

p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell) 

mallet_info = p.communicate()[0].decode('utf-8') 

if print_output: 

log.info(mallet_info) 

except KeyboardInterrupt: 

p.terminate() 

log.error(mallet_info) 

return output_file 

 

 

def create_mallet_model(path_to_mallet='mallet', path_to_binary=None, input_model=None, 

input_state=None, folder_for_output='mallet_output', 

output_model=True, output_model_interval=0, output_state=True, 

output_state_interval=0, inferencer_file=True, evaluator_file=True, 

output_topic_keys=True, topic_word_weights_file=True, 

word_topic_counts_file=True, diagnostics_file=True, xml_topic_report=True, 

xml_topic_phrase_report=True, output_topic_docs=False, num_top_docs=100, 

output_doc_topics=True, doc_topics_threshold=0.0, 

num_topics=10, num_top_words=10, 

num_iterations=1000, num_threads=1, num_icm_iterations=0, 

no_inference=False, random_seed=0, optimize_interval=0, 

optimize_burn_in=200, use_symmetric_alpha=False, alpha=5.0, 

beta=0.01): 

"""Creates MALLET model. 

 

Description: 

bla 

 

Args: 

path_to_mallet (str): Path to MALLET. Defaults to 'mallet'. If MALLET is 

not properly installed use absolute path, e.g. '/home/workspace/mallet/bin/mallet'. 

path_to_binary (str): Path to previously created MALLET binary. 

input_model (str): Absolute path to the binary topic model created by `output_model`. 

input_state (str): Absolute path to the gzipped Gibbs sampling state created by `output_state`. 

folder_for_output (str): Folder for MALLET output. 

output_model (bool): Write a serialized MALLET topic trainer object. 

This type of output is appropriate for pausing and restarting training, 

but does not produce data that can easily be analyzed. Defaults to True. 

output_model_interval (int): The number of iterations between writing the 

model (and its Gibbs sampling state) to a binary file. You must also 

set the `output_model` parameter to use this option, whose argument 

will be the prefix of the filenames. Defaults to 0. 

output_state (bool): Write a compressed text file containing the words 

in the corpus with their topic assignments. The file format can easily 

be parsed and used by non-Java-based software. Defaults to True. 

output_state_interval (int): The number of iterations between writing the 

sampling state to a text file. You must also set the `output_state` 

to use this option, whose argument will be the prefix of the filenames. 

Defaults to 0. 

inference_file (bool): A topic inferencer applies a previously trained 

topic model to new documents. Defaults to False. 

evaluator_file (bool): A held-out likelihood evaluator for new documents. 

Defaults to False. 

output_topic_keys (bool): Write the top words for each topic and any 

Dirichlet parameters. Defaults to True. 

topic_word_weights_file (bool): Write unnormalized weights for every 

topic and word type. Defaults to True. 

word_topic_counts_file (bool): Write a sparse representation of topic-word 

assignments. By default this is null, indicating that no file will 

be written. Defaults to True. 

diagnostics_file (bool): Write measures of topic quality, in XML format. 

Defaults to True. 

xml_topic_report (bool): Write the top words for each topic and any 

Dirichlet parameters in XML format. Defaults to True. 

xml_topic_phrase_report (bool): Write the top words and phrases for each 

topic and any Dirichlet parameters in XML format. Defaults to True. 

output_topic_docs (bool): Currently not available. Write the most prominent 

documents for each topic, at the end of the iterations. Defaults to False. 

num_top_docs (int): Currently not available. Number of top documents for 

`output_topic_docs`. Defaults to False. 

output_doc_topics (bool): Write the topic proportions per document, at 

the end of the iterations. Defaults to True. 

doc_topics_threshold (float): Do not print topics with proportions less 

than this threshold value within `output_doc_topics`. Defaults to 0.0. 

num_topics (int): Number of topics. Defaults to 10. 

num_top_words (int): Number of keywords for each topic. Defaults to 10. 

num_interations (int): Number of iterations. Defaults to 1000. 

num_threads (int): Number of threads for parallel training. Defaults to 1. 

num_icm_iterations (int): Number of iterations of iterated conditional 

modes (topic maximization). Defaults to 0. 

no_inference (bool): Load a saved model and create a report. Equivalent 

to `num_iterations = 0`. Defaults to False. 

random_seed (int): Random seed for the Gibbs sampler. Defaults to 0. 

optimize_interval (int): Number of iterations between reestimating 

dirichlet hyperparameters. Defaults to 0. 

optimize_burn_in (int): Number of iterations to run before first 

estimating dirichlet hyperparameters. Defaults to 200. 

use_symmetric_alpha (bool): Only optimize the concentration parameter of 

the prior over document-topic distributions. This may reduce the 

number of very small, poorly estimated topics, but may disperse common 

words over several topics. Defaults to False. 

alpha (float): Sum over topics of smoothing over doc-topic distributions. 

alpha_k = [this value] / [num topics]. Defaults to 5.0. 

beta (float): Smoothing parameter for each topic-word. Defaults to 0.01. 

 

 

ToDo: 

Param 'output_topic_docs' is causing an internal error 

-> Exception in thread "main" java.lang.ClassCastException: java.net.URI cannot be cast to java.lang.String 

-> at cc.mallet.topics.ParallelTopicModel.printTopicDocuments(ParallelTopicModel.java:1773) 

-> at cc.mallet.topics.tui.TopicTrainer.main(TopicTrainer.java:281) 

Param, 'num_top_docs' is obsolete, refering to 'output_topic_docs' 

 

Returns: 

Nothing. 

""" 

if system() == 'Windows': 

shell = True 

else: 

shell = False 

 

os.makedirs(folder_for_output, exist_ok=True) 

 

param = [path_to_mallet, 'train-topics'] 

if input_model is None: 

param.append('--input') 

else: 

param.append('--input-model') 

param.append(input_model) 

if path_to_binary is not None: 

param.append(path_to_binary) 

if input_state is not None: 

param.append('--input-state') 

param.append(input_state) 

 

log.debug("Choosing parameters ...") 

if num_topics is not False: 

param.append('--num-topics') 

param.append(str(num_topics)) 

if num_iterations is not False: 

param.append('--num-iterations') 

param.append(str(num_iterations)) 

if num_threads is not False: 

param.append('--num-threads') 

param.append(str(num_threads)) 

if num_top_words is not False: 

param.append('--num-top-words') 

param.append(str(num_top_words)) 

if num_icm_iterations is not False: 

param.append('--num-icm-iterations') 

param.append(str(num_icm_iterations)) 

if no_inference is not False: 

param.append('--no-inference') 

param.append(str(no_inference)) 

if random_seed is not False: 

param.append('--random-seed') 

param.append(str(random_seed)) 

 

log.debug("Choosing hyperparameters ...") 

if optimize_interval is not None: 

param.append('--optimize-interval') 

param.append(str(optimize_interval)) 

if optimize_burn_in is not None: 

param.append('--optimize-burn-in') 

param.append(str(optimize_burn_in)) 

if use_symmetric_alpha is not None: 

param.append('--use-symmetric-alpha') 

if alpha is not None: 

param.append('--alpha') 

param.append(str(alpha)) 

if beta is not None: 

param.append('--beta') 

param.append(str(beta)) 

 

log.debug("Choosing output parameters ...") 

if output_topic_keys: 

param.append('--output-topic-keys') 

param.append(os.path.join(folder_for_output, 'topic_keys.txt')) 

if output_doc_topics: 

param.append('--output-doc-topics') 

param.append(os.path.join(folder_for_output, 'doc_topics.txt')) 

if doc_topics_threshold is not None: 

param.append('--doc-topics-threshold') 

param.append(str(doc_topics_threshold)) 

if topic_word_weights_file: 

param.append('--topic-word-weights-file') 

param.append(os.path.join(folder_for_output, 'topic_word_weights.txt')) 

if word_topic_counts_file: 

param.append('--word-topic-counts-file') 

param.append(os.path.join(folder_for_output, 'word_topic_counts.txt')) 

if diagnostics_file: 

param.append('--diagnostics-file') 

param.append(os.path.join(folder_for_output, 'diagnostics.xml')) 

if xml_topic_report: 

param.append('--xml-topic-report') 

param.append(os.path.join(folder_for_output, 'topic_report.xml')) 

if xml_topic_phrase_report: 

param.append('--xml-topic-phrase-report') 

param.append(os.path.join(folder_for_output, 'topic_phrase_report.xml')) 

if output_model: 

param.append('--output-model') 

param.append(os.path.join(folder_for_output, 'mallet.model')) 

if output_model_interval is not None: 

param.append('--output-model-interval') 

param.append(str(output_model_interval)) 

if output_state: 

param.append('--output-state') 

param.append(os.path.join(folder_for_output, 'state.gz')) 

if output_state_interval is not None: 

param.append('--output-state-interval') 

param.append(str(output_state_interval)) 

if inferencer_file: 

param.append('--inferencer-filename') 

param.append(os.path.join(folder_for_output, 'inferencer')) 

if evaluator_file: 

param.append('--evaluator-filename') 

param.append(os.path.join(folder_for_output, 'evaluator')) 

 

# not yet working 

if output_topic_docs: 

param.append('--output-topic-docs') 

param.append(os.path.join(outfolder, 'topic_docs.txt')) 

if num_top_docs is not None: 

param.append('--num-top-docs') 

param.append(str(topic_word_weights_file)) 

 

try: 

log.info("Accessing Mallet with %s ...", param) 

p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell) 

out = p.communicate()[1].decode('utf-8') 

log.debug(out) 

except KeyboardInterrupt: 

p.terminate() 

log.error(out) 

 

def _grouper(n, iterable, fillvalue=None): 

"""Collects data into fixed-length chunks or blocks. 

 

Args: 

 

Returns: 

 

""" 

 

args=[iter(iterable)] * n 

return itertools.zip_longest(*args, fillvalue=fillvalue) 

 

 

def show_doc_topic_matrix(output_folder, doc_topics='doc_topics.txt', topic_keys='topic_keys.txt', 

easy_file_format=False): 

"""Shows document-topic-mapping. 

Args: 

outfolder (str): Folder for MALLET output. 

doc_topics (str): Name of MALLET's doc_topic file. Defaults to 'doc_topics.txt'. 

topic_keys (str): Name of MALLET's topic_keys file. Defaults to 'topic_keys.txt'. 

 

ToDo: Prettify docnames 

""" 

 

doc_topics=os.path.join(output_folder, doc_topics) 

assert doc_topics 

topic_keys=os.path.join(output_folder, topic_keys) 

assert topic_keys 

 

doctopic_triples=[] 

mallet_docnames=[] 

topics=[] 

 

df=pd.read_csv(topic_keys, sep='\t', header=None, encoding='utf-8') 

labels=[] 

for index, item in df.iterrows(): 

label=' '.join(item[2].split()[:3]) 

labels.append(label) 

 

with open(doc_topics, encoding='utf-8') as f: 

for line in f: 

li=line.lstrip() 

if li.startswith("#"): 

lines=f.readlines() 

for line in lines: 

docnum, docname, *values=line.rstrip().split('\t') 

mallet_docnames.append(docname) 

for topic, share in _grouper(2, values): 

triple=(docname, int(topic), float(share)) 

topics.append(int(topic)) 

doctopic_triples.append(triple) 

else: 

easy_file_format=True 

break 

 

if easy_file_format: 

newindex=[] 

doc_topic_matrix=pd.read_csv( 

doc_topics, sep='\t', names=labels[0:], encoding='utf-8') 

for eins, zwei in doc_topic_matrix.index: 

newindex.append(os.path.basename(zwei)) 

doc_topic_matrix.index=newindex 

else: 

# sort the triples 

# triple is (docname, topicnum, share) so sort(key=operator.itemgetter(0,1)) 

# sorts on (docname, topicnum) which is what we want 

doctopic_triples=sorted( 

doctopic_triples, key=operator.itemgetter(0, 1)) 

 

# sort the document names rather than relying on MALLET's ordering 

mallet_docnames=sorted(mallet_docnames) 

 

# collect into a document-term matrix 

num_docs=len(mallet_docnames) 

 

num_topics=max(topics) + 1 

 

# the following works because we know that the triples are in 

# sequential order 

data=np.zeros((num_docs, num_topics)) 

 

for triple in doctopic_triples: 

docname, topic, share=triple 

row_num=mallet_docnames.index(docname) 

data[row_num, topic]=share 

 

topicLabels=[] 

 

# creates list of topic lables consisting of the 3 most weighed topics 

df=pd.read_csv(topic_keys, sep='\t', header=None, encoding='utf-8') 

labels=[] 

for index, item in df.iterrows(): 

 

topicLabel=' '.join(item[2].split()[:3]) 

topicLabels.append(topicLabel) 

 

shortened_docnames=[] 

for item in mallet_docnames: 

shortened_docnames.append(os.path.basename(item)) 

 

''' 

for topic in range(max(topics)+1): 

topicLabels.append("Topic_" + str(topic)) 

''' 

doc_topic_matrix=pd.DataFrame(data=data[0:, 0:], 

index=shortened_docnames[0:], 

columns=topicLabels[0:]) 

return doc_topic_matrix.T 

 

def show_topics_keys(output_folder, topicsKeyFile="topic_keys.txt", num_topics=10): 

"""Show topic-key-mapping. 

 

Args: 

outfolder (str): Folder for Mallet output, 

topicsKeyFile (str): Name of Mallets' topic_key file, default "topic_keys" 

 

#topic-model-mallet 

Note: FBased on DARIAH-Tutorial -> https://de.dariah.eu/tatom/topic_model_mallet.html 

 

ToDo: Prettify index 

""" 

 

path_to_topic_keys=os.path.join(output_folder, topicsKeyFile) 

assert path_to_topic_keys 

 

with open(path_to_topic_keys, encoding='utf-8') as input: 

topic_keys_lines=input.readlines() 

 

topic_keys=[] 

#topicLabels=[] 

 

 

for line in topic_keys_lines: 

_, _, words=line.split('\t') # tab-separated 

words=words.rstrip().split(' ') # remove the trailing '\n' 

topic_keys.append(words) 

 

topicKeysMatrix=pd.DataFrame(topic_keys) 

topicKeysMatrix.index=['Topic ' + str(x + 1) for x in range(num_topics)] 

topicKeysMatrix.columns=['Key ' + str(x + 1) for x in range(10)] 

return topicKeysMatrix