Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

#!/usr/bin/env python3 

# -*- coding: utf-8 -*- 

 

"""MALLET wrapper for Python. 

 

This module contains various `MALLET`_ related functions for Topic Modeling 

provided by `DARIAH-DE`_. 

 

.. _MALLET: 

http://mallet.cs.umass.edu/ 

.. _DARIAH-DE: 

https://de.dariah.eu 

https://github.com/DARIAH-DE 

""" 

 

__author__ = "DARIAH-DE" 

__authors__ = "Steffen Pielstroem, Sina Bock, Severin Simmler" 

__email__ = "pielstroem@biozentrum.uni-wuerzburg.de" 

__date__ = "2017-03-28" 

 

import itertools 

import logging 

import numpy as np 

import operator 

import os 

import pandas as pd 

from platform import system 

from subprocess import Popen, PIPE 

 

log = logging.getLogger('mallet') 

log.addHandler(logging.NullHandler()) 

logging.basicConfig(level=logging.INFO, 

format='%(levelname)s %(name)s: %(message)s') 

 

 

def create_mallet_binary(outfile='binary.mallet', outfolder='mallet_output',
                         path_to_file=False, path_to_corpus=os.path.join(os.path.abspath('.'), 'corpus_txt'),
                         path_to_mallet="mallet", keep_sequence=True, preserve_case=False,
                         remove_stopwords=True, stoplist=None, token_regex=False, use_pipe_from=False,
                         replacement_files=False, deletion_files=False, extra_stopwords=False):
    """Creates a MALLET binary file.

    Args:
        outfile (str): Name of the MALLET file that will be generated. Defaults to 'binary.mallet'.
        outfolder (str): Folder for output. Created if it does not exist.
        path_to_file (str): Absolute path to a single text file, e.g. '/home/workspace/testfile.txt'.
            If False (the default), the whole corpus folder is imported instead.
        path_to_corpus (str): Absolute path to corpus folder, e.g. '/home/workspace/corpus_txt'.
        path_to_mallet (str): Path to MALLET. Defaults to 'mallet'.
            If MALLET is not properly installed use an absolute path,
            e.g. '/home/workspace/mallet/bin/mallet'.
        keep_sequence (bool): Preserves the document as a sequence of word features,
            rather than a vector of word feature counts. MALLET's topic trainer
            requires feature sequences rather than feature vectors, so this
            defaults to True.
        preserve_case (bool): If False, converts all word features to lowercase. Defaults to False.
        remove_stopwords (bool): Ignores a standard list of very common tokens. Defaults to True.
        stoplist (str): Absolute path to a plain text stopword list. Defaults to None.
        token_regex (str): Divides documents into tokens using a regular expression.
            Defaults to False.
        use_pipe_from (str): Use the pipe and alphabets from a previously created vectors file.
            Allows the creation, for example, of a test set of vectors that are
            compatible with a previously created set of training vectors. Defaults to False.
        replacement_files (str): Files containing string replacements, one per line:
            'A B [tab] C' replaces A B with C,
            'A B' replaces A B with A_B. Defaults to False.
        deletion_files (str): Files containing strings to delete after replacements but before
            tokenization (i.e. multiword stop terms). Defaults to False.
        extra_stopwords (str): File of additional stopwords to ignore. Defaults to False.

    Returns:
        String. Path to the created MALLET binary file.
    """
    log = logging.getLogger('mallet')
    # MALLET ships as a .bat wrapper on Windows, which needs a shell to run.
    shell = system() == 'Windows'

    path_to_binary = os.path.join(outfolder, outfile)

    if not os.path.exists(outfolder):
        log.info("Creating output folder ...")
        os.makedirs(outfolder)

    param = [path_to_mallet]
    if not path_to_file:
        param.extend(['import-dir', '--input', path_to_corpus])
    else:
        param.extend(['import-file', '--input', path_to_file])
    if preserve_case:
        param.append('--preserve-case')
    if remove_stopwords:
        param.append('--remove-stopwords')
    if stoplist is not None:
        param.extend(['--stoplist-file', stoplist])
    if token_regex is not False:
        param.extend(['--token-regex', token_regex])
    if use_pipe_from:
        param.extend(['--use-pipe-from', use_pipe_from])
    if replacement_files:
        param.extend(['--replacement-files', replacement_files])
    if deletion_files:
        param.extend(['--deletion-files', deletion_files])
    if extra_stopwords:
        param.extend(['--extra-stopwords', extra_stopwords])

    param.extend(['--output', path_to_binary])
    # Fix: honor the keep_sequence parameter. The flag was previously appended
    # unconditionally, making the parameter dead; the default is now True so
    # existing callers get the same binary as before.
    if keep_sequence:
        param.append('--keep-sequence')

    out = ''
    p = None
    try:
        log.info("Accessing MALLET with %s ...", param)
        p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell)
        out = p.communicate()[0].decode('utf-8')
        log.debug(out)
    except KeyboardInterrupt:
        # Fix: guard against 'p' (and 'out') being unbound when the interrupt
        # arrives before Popen() returned.
        if p is not None:
            p.terminate()
        log.error(out)
    return path_to_binary

 

 

def create_mallet_model(path_to_binary, outfolder, path_to_mallet='mallet', num_topics=False,
                        num_top_words=False, num_iterations=False, num_threads=False,
                        num_icm_iterations=False, no_inference=False, random_seed=False,
                        optimize_interval=False, optimize_burn_in=False, use_symmetric_alpha=False,
                        alpha=False, beta=False, output_topic_keys=True, topic_word_weights_file=True,
                        word_topic_counts_file=True, diagnostics_file=True, xml_topic_report=True,
                        xml_topic_phrase_report=False, output_topic_docs=False, num_top_docs=False,
                        output_doc_topics=True, doc_topics_threshold=False, output_model=True,
                        output_state=True, doc_topics_max=False):
    """Creates a MALLET model by running 'mallet train-topics'.

    Note: Use `create_mallet_binary()` to create `path_to_binary`.

    Args:
        path_to_binary (str): Path to MALLET binary.
        outfolder (str): Folder for MALLET output.
        path_to_mallet (str): Path to MALLET. Defaults to 'mallet'.
            If MALLET is not properly installed use an absolute path,
            e.g. '/home/workspace/mallet/bin/mallet'.
        num_topics (int): Number of topics. Defaults to False (MALLET default).
        num_top_words (int): Number of keywords for each topic. Defaults to False.
        num_iterations (int): Number of iterations. Defaults to False.
        num_threads (int): Number of threads for parallel training. Defaults to False.
        num_icm_iterations (int): Number of iterations of iterated conditional modes
            (topic maximization). Defaults to False.
        no_inference (bool): Load a saved model and create a report. Equivalent to
            `num_iterations = 0`. Defaults to False.
        random_seed (int): Random seed for the Gibbs sampler. Defaults to False.
        optimize_interval (int): Number of iterations between reestimating dirichlet
            hyperparameters. Defaults to False.
        optimize_burn_in (int): Number of iterations to run before first estimating
            dirichlet hyperparameters. Defaults to False.
        use_symmetric_alpha (bool): Only optimize the concentration parameter of the prior
            over document-topic distributions. This may reduce the number of very small,
            poorly estimated topics, but may disperse common words over several topics.
            Defaults to False.
        alpha (float): Sum over topics of smoothing over doc-topic distributions.
            alpha_k = [this value] / [num topics]. Defaults to False.
        beta (float): Smoothing parameter for each topic-word. Defaults to False.
        output_topic_keys (bool): Write the top words for each topic and any Dirichlet
            parameters. Defaults to True.
        topic_word_weights_file (bool): Write unnormalized weights for every topic and
            word type. Defaults to True.
        word_topic_counts_file (bool): Write a sparse representation of topic-word
            assignments. Defaults to True.
        diagnostics_file (bool): Write measures of topic quality, in XML format. Defaults to True.
        xml_topic_report (bool): Write the top words for each topic and any Dirichlet
            parameters in XML format. Defaults to True.
        xml_topic_phrase_report (bool): Write the top words and phrases for each topic and
            any Dirichlet parameters in XML format. Defaults to False.
        output_topic_docs (bool): Write the most prominent documents for each topic, at the
            end of the iterations. Defaults to False.
        num_top_docs (int): Number of top documents for `output_topic_docs`. Defaults to False.
        output_doc_topics (bool): Write the topic proportions per document, at the end of
            the iterations. Defaults to True.
        doc_topics_threshold (float): Do not print topics with proportions less than this
            threshold value within `output_doc_topics`. Defaults to False.
        doc_topics_max (int): Do not print more than this number of topics. A negative value
            indicates that all topics should be printed. Defaults to False.
        output_model (bool): Write a serialized MALLET topic trainer object. Appropriate for
            pausing and restarting training, but does not produce data that can easily be
            analyzed. Defaults to True.
        output_state (bool): Output a compressed text file containing the words in the corpus
            with their topic assignments. The file format can easily be parsed and used by
            non-Java-based software. Defaults to True.

    ToDo:
        Param 'output_topic_docs' is causing an internal error in MALLET
        -> Exception in thread "main" java.lang.ClassCastException: java.net.URI cannot be cast to java.lang.String
        -> at cc.mallet.topics.ParallelTopicModel.printTopicDocuments(ParallelTopicModel.java:1773)
        -> at cc.mallet.topics.tui.TopicTrainer.main(TopicTrainer.java:281)
        Param 'num_top_docs' is only meaningful together with 'output_topic_docs'.

    Returns:
        None
    """
    log = logging.getLogger('mallet')
    log.debug(outfolder)
    # MALLET ships as a .bat wrapper on Windows, which needs a shell to run.
    shell = system() == 'Windows'

    outfolder = os.path.join(os.path.abspath('.'), outfolder)
    param = [path_to_mallet, 'train-topics', '--input', path_to_binary]

    # parameter (all values go through str(): Popen requires string arguments):
    if num_topics is not False:
        param.extend(['--num-topics', str(num_topics)])
    if num_iterations is not False:
        param.extend(['--num-iterations', str(num_iterations)])
    if num_threads is not False:
        param.extend(['--num-threads', str(num_threads)])
    if num_top_words is not False:
        param.extend(['--num-top-words', str(num_top_words)])
    if num_icm_iterations is not False:
        param.extend(['--num-icm-iterations', str(num_icm_iterations)])
    if no_inference is not False:
        param.extend(['--no-inference', str(no_inference)])
    if random_seed is not False:
        param.extend(['--random-seed', str(random_seed)])

    # hyperparameter:
    # Fix: these values were previously appended unconverted; non-string
    # arguments (ints/floats) make Popen raise TypeError.
    if optimize_interval is not False:
        param.extend(['--optimize-interval', str(optimize_interval)])
    if optimize_burn_in is not False:
        param.extend(['--optimize-burn-in', str(optimize_burn_in)])
    if use_symmetric_alpha is not False:
        param.extend(['--use-symmetric-alpha', str(use_symmetric_alpha)])
    if alpha is not False:
        param.extend(['--alpha', str(alpha)])
    if beta is not False:
        param.extend(['--beta', str(beta)])

    # output:
    if output_topic_keys:
        param.extend(['--output-topic-keys', os.path.join(outfolder, 'topic_keys.txt')])
    if topic_word_weights_file:
        param.extend(['--topic-word-weights-file', os.path.join(outfolder, 'topic_word_weights.txt')])
    if word_topic_counts_file:
        param.extend(['--word-topic-counts-file', os.path.join(outfolder, 'word_topic_counts.txt')])
    if diagnostics_file:
        param.extend(['--diagnostics-file', os.path.join(outfolder, 'diagnostics.txt')])
    if xml_topic_report:
        param.extend(['--xml-topic-report', os.path.join(outfolder, 'topic_report.xml')])
    if xml_topic_phrase_report:
        param.extend(['--xml-topic-phrase-report', os.path.join(outfolder, 'topic_phrase_report.xml')])
    # not yet working, see ToDo in the docstring:
    if output_topic_docs:
        param.extend(['--output-topic-docs', os.path.join(outfolder, 'topic_docs.txt')])
    if num_top_docs is not False:
        # Fix: previously appended topic_word_weights_file (copy-paste bug).
        param.extend(['--num-top-docs', str(num_top_docs)])
    if output_doc_topics:
        param.extend(['--output-doc-topics', os.path.join(outfolder, 'doc_topics.txt')])
    if doc_topics_threshold:
        # Fix: previously appended topic_word_weights_file (copy-paste bug).
        param.extend(['--doc-topics-threshold', str(doc_topics_threshold)])
    if doc_topics_max:
        # Fix: previously appended topic_word_weights_file (copy-paste bug).
        param.extend(['--doc-topics-max', str(doc_topics_max)])
    if output_model:
        param.extend(['--output-model', os.path.join(outfolder, 'mallet.model')])
    if output_state:
        param.extend(['--output-state', os.path.join(outfolder, 'state.gz')])

    out = ''
    p = None
    try:
        log.info("Accessing Mallet with %s ...", param)
        p = Popen(param, stdout=PIPE, stderr=PIPE, shell=shell)
        # MALLET reports progress on stderr, hence index [1].
        out = p.communicate()[1].decode('utf-8')
        log.debug(out)
    except KeyboardInterrupt:
        # Fix: guard against 'p' (and 'out') being unbound when the interrupt
        # arrives before Popen() returned.
        if p is not None:
            p.terminate()
        log.error(out)

 

def _grouper(n, iterable, fillvalue=None): 

"""Collects data into fixed-length chunks or blocks. 

 

Args: 

 

Returns: 

 

""" 

 

args=[iter(iterable)] * n 

return itertools.zip_longest(*args, fillvalue=fillvalue) 

 

 

def show_doc_topic_matrix(output_folder, doc_topics='doc_topics.txt', topic_keys='topic_keys.txt',
                          easy_file_format=False):
    """Shows document-topic-mapping.

    Builds a topic-by-document DataFrame from MALLET's doc_topics output.
    Two file layouts are handled: the old "#doc name topic proportion ..."
    format (header line starts with '#', topic/share value pairs follow), and
    the newer plain tab-separated matrix ("easy" format).

    Args:
        output_folder (str): Folder containing the MALLET output files.
        doc_topics (str): Name of MALLET's doc_topic file. Defaults to 'doc_topics.txt'.
        topic_keys (str): Name of MALLET's topic_keys file. Defaults to 'topic_keys.txt'.
        easy_file_format (bool): Force the plain matrix parsing path. Defaults to False;
            it is switched on automatically if the file has no '#' header.

    Returns:
        pd.DataFrame: topics as rows (labelled by their top three key words),
        documents as columns (basenames of the corpus files).

    ToDo: Prettify docnames
    """
    doc_topics = os.path.join(output_folder, doc_topics)
    # NOTE(review): os.path.join never returns an empty string here, so this
    # assert is vacuous — the open() below is what actually reports a bad path.
    assert doc_topics
    topic_keys = os.path.join(output_folder, topic_keys)
    assert topic_keys

    doctopic_triples = []   # (docname, topic number, share) tuples
    mallet_docnames = []    # document names in MALLET's own order
    topics = []             # every topic number seen (max determines matrix width)

    # Topic labels: first three key words of each topic_keys line
    # (format: topic number \t dirichlet parameter \t space-separated keys).
    df = pd.read_csv(topic_keys, sep='\t', header=None, encoding='utf-8')
    labels = []
    for index, item in df.iterrows():
        label = ' '.join(item[2].split()[:3])
        labels.append(label)

    with open(doc_topics, encoding='utf-8') as f:
        for line in f:
            li = line.lstrip()
            if li.startswith("#"):
                # Old format: first line is a '#' header; consume the rest of
                # the file here (f.readlines() continues after the header).
                lines = f.readlines()
                for line in lines:
                    # Each data row: doc number, doc name, then alternating
                    # topic/share values, paired up via _grouper.
                    docnum, docname, *values = line.rstrip().split('\t')
                    mallet_docnames.append(docname)
                    for topic, share in _grouper(2, values):
                        triple = (docname, int(topic), float(share))
                        topics.append(int(topic))
                        doctopic_triples.append(triple)
            else:
                # No '#' header on the first line: fall through to the plain
                # tab-separated matrix format below.
                easy_file_format = True
                break

    if easy_file_format:
        newindex = []
        doc_topic_matrix = pd.read_csv(
            doc_topics, sep='\t', names=labels[0:], encoding='utf-8')
        # assumes the index parses as (doc number, doc path) 2-tuples, keeping
        # only the file's basename — TODO confirm against MALLET's output.
        for eins, zwei in doc_topic_matrix.index:
            newindex.append(os.path.basename(zwei))
        doc_topic_matrix.index = newindex
    else:
        # sort the triples
        # triple is (docname, topicnum, share) so sort(key=operator.itemgetter(0,1))
        # sorts on (docname, topicnum) which is what we want
        doctopic_triples = sorted(
            doctopic_triples, key=operator.itemgetter(0, 1))

        # sort the document names rather than relying on MALLET's ordering
        mallet_docnames = sorted(mallet_docnames)

        # collect into a document-term matrix
        num_docs = len(mallet_docnames)

        num_topics = max(topics) + 1

        # the following works because we know that the triples are in
        # sequential order
        data = np.zeros((num_docs, num_topics))

        for triple in doctopic_triples:
            docname, topic, share = triple
            row_num = mallet_docnames.index(docname)
            data[row_num, topic] = share

        topicLabels = []

        # creates list of topic labels consisting of the 3 most weighed topics
        df = pd.read_csv(topic_keys, sep='\t', header=None, encoding='utf-8')
        labels = []
        for index, item in df.iterrows():

            topicLabel = ' '.join(item[2].split()[:3])
            topicLabels.append(topicLabel)

        # shorten document names to their basenames for readable columns
        shortened_docnames = []
        for item in mallet_docnames:
            shortened_docnames.append(os.path.basename(item))

        '''
        for topic in range(max(topics)+1):
            topicLabels.append("Topic_" + str(topic))
        '''
        doc_topic_matrix = pd.DataFrame(data=data[0:, 0:],
                                        index=shortened_docnames[0:],
                                        columns=topicLabels[0:])
    # Transpose so topics are rows and documents are columns.
    return doc_topic_matrix.T

 

def show_topics_keys(output_folder, topicsKeyFile="topic_keys.txt", num_topics=10):
    """Show topic-key-mapping.

    Args:
        output_folder (str): Folder containing MALLET output.
        topicsKeyFile (str): Name of MALLET's topic_keys file. Defaults to 'topic_keys.txt'.
        num_topics (int): Number of topics (rows) in the file. Defaults to 10.

    Returns:
        pd.DataFrame: one row per topic ('Topic 1', ...), one column per
        key word ('Key 1', ...).

    Raises:
        FileNotFoundError: If the topic_keys file does not exist.

    Note: Based on DARIAH-Tutorial -> https://de.dariah.eu/tatom/topic_model_mallet.html

    ToDo: Prettify index
    """
    path_to_topic_keys = os.path.join(output_folder, topicsKeyFile)

    # Fix: renamed from 'input', which shadowed the builtin.
    with open(path_to_topic_keys, encoding='utf-8') as infile:
        topic_keys_lines = infile.readlines()

    topic_keys = []
    for line in topic_keys_lines:
        # Line format: topic number \t dirichlet parameter \t space-separated keys.
        _, _, words = line.split('\t')
        words = words.rstrip().split(' ')  # remove the trailing '\n'
        topic_keys.append(words)

    topicKeysMatrix = pd.DataFrame(topic_keys)
    topicKeysMatrix.index = ['Topic ' + str(x + 1) for x in range(num_topics)]
    # Fix: label one column per actual key word instead of assuming exactly 10;
    # the hard-coded range(10) raised ValueError whenever MALLET was run with
    # num_top_words != 10.
    topicKeysMatrix.columns = ['Key ' + str(x + 1)
                               for x in range(topicKeysMatrix.shape[1])]
    return topicKeysMatrix