#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Handling MALLET in Python
*************************

Functions and classes of this module are for **handling `MALLET <http://mallet.cs.umass.edu/topics.php>`_ \
in Python**.

Contents
********

* :func:`call_commandline()`
* :class:`Mallet`
* :func:`call_mallet()`
* :func:`import_tokenized_corpus()`
* :func:`train_topics()`

"""

__author__ = "DARIAH-DE"
__authors__ = "Steffen Pielstroem, Sina Bock, Severin Simmler"
__email__ = "pielstroem@biozentrum.uni-wuerzburg.de"

import itertools
import logging
import os
import random
import re
import shutil
import string
import tempfile
from platform import system
from subprocess import Popen, PIPE

import numpy as np
import pandas as pd

from dariah_topics import postprocessing

log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())
logging.basicConfig(level=logging.DEBUG,
                    format='%(levelname)s %(name)s: %(message)s')


def _decode(std):
    """Decodes the bytes-like output of a subprocess in UTF-8.

    This private function is wrapped in :func:`call_commandline()`.

    Args:
        std (bytes-like): The ``stdout`` or ``stderr`` (or whatever) of a
            subprocess.

    Returns:
        A list of decoded strings.

    Example:
        >>> _decode([bytes('This is a test.', encoding='utf-8')])
        ['This is a test.']
    """
    return [line.decode('utf-8').replace('\n', '') for line in std]


def call_commandline(cmd, stdin=None, stdout='pipe', stderr='pipe', communicate=True, logfile=False):
    """Calls the command-line from within Python.

    With this function you can call the command-line with a specific command. Each \
    argument has to be an element in a list (``cmd``).

    Args:
        cmd (list): A list of command-line arguments.
        stdin (str, optional): Value for stdin. Defaults to None.
        stdout (str, optional): Value for stdout. Defaults to ``pipe``.
        stderr (str, optional): Value for stderr. Defaults to ``pipe``.
        communicate (bool, optional): If True, ``stdout`` and ``stderr`` will be
            processed. Defaults to True.
        logfile (bool, optional): If True, a logfile (``commandline.log``) will
            be created. Otherwise ``stdout`` (and ``stderr``, respectively) will
            be printed as logging to the console (level: INFO).

    Returns:
        :class:`Popen` object of the subprocess.

    Example:
        >>> process = call_commandline(['mkdir', 'test_dir'])
        >>> process.wait()
        0
        >>> os.path.exists('test_dir')
        True
    """
    if stdin == 'pipe':
        stdin = PIPE
    if stdout == 'pipe':
        stdout = PIPE
    if stderr == 'pipe':
        stderr = PIPE

    if not all(isinstance(arg, str) for arg in cmd):
        cmd = [str(arg) for arg in cmd]

    log.info("Calling the command-line: {0} ...".format(' '.join(cmd)))

    process = Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)

    if communicate:
        decoded_stderr = _decode(process.stderr)
        decoded_stdout = _decode(process.stdout)
        if logfile:
            log.info("Check commandline.log in '{0}' for logging.".format(os.getcwd()))
            with open('commandline.log', 'w', encoding='utf-8') as file:
                file.write('\n'.join(decoded_stdout))
                file.write('\n'.join(decoded_stderr))
        else:
            for line_stdout in decoded_stdout:
                log.info(line_stdout)
            for line_stderr in decoded_stderr:
                log.info(line_stderr)
    return process


def _check_whitespace(string):
    """Checks if whitespaces are in a string.

    This private function is wrapped in :func:`call_mallet()`.

    Args:
        string (str): Obviously, a string.

    Returns:
        True if whitespaces are **not** in the string, otherwise False.

    Example:
        >>> _check_whitespace('nowhitespace')
        True
        >>> _check_whitespace('white space')
        False
    """
    return not re.search(r'\s', str(string))


def _check_mallet_output(keyword, kwargs=None):
    """Checks if MALLET created output.

    This private function is wrapped in :func:`import_tokenized_corpus()` and \
    :func:`train_topics()`.

    Args:
        keyword (str): A token which has to be in ``kwargs.values()``.
        kwargs (dict, optional): Args for the MALLET functions.

    Raises:
        OSError, if MALLET did not produce any output files.
    """
    if 'corpus.mallet' not in keyword:
        output_files = [value for arg, value in kwargs.items() if keyword in arg or 'txt' in str(value) or 'xml' in str(value)]
    else:
        output_files = [keyword]

    if not all(os.path.exists(file) for file in output_files):
        raise OSError("MALLET did not produce any output files. Maybe check your args?")


class Mallet:
    """Python wrapper for MALLET.

    With this class you can call the command-line tool `MALLET <http://mallet.cs.umass.edu/topics.php>`_ \
    from within Python.
    """
    def __init__(self, executable='mallet', corpus_output=None, logfile=True):
        self.executable = shutil.which(executable)
        if self.executable is None:
            raise FileNotFoundError(("The executable '{0}' could not be found.\n"
                                     "Either place the executable into the $PATH or call "
                                     "{1}(executable='/path/to/mallet')").format(executable, self.__class__.__name__))
        if corpus_output is None:
            # Write the corpus files to a temporary directory with a random
            # five-character prefix, unless the user provides a path.
            prefix = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(5))
            self.corpus_output = os.path.join(tempfile.gettempdir(), prefix)
        else:
            self.corpus_output = corpus_output
        self.logfile = logfile

    def call_mallet(self, command, **kwargs):
        """Calls the command-line tool MALLET.

        With this function you can call `MALLET <http://mallet.cs.umass.edu/topics.php>`_ \
        using a specific ``command`` (e.g. ``train-topics``) and its parameters.
        **Whitespaces (especially for Windows users) are not allowed in paths.**

        Args:
            command (str): A MALLET command, this could be ``import-dir`` (load
                the contents of a directory into MALLET instances), ``import-file``
                (load a single file into MALLET instances), ``import-svmlight``
                (load SVMLight format data files into MALLET instances), ``info``
                (get information about MALLET instances), ``train-classifier``
                (train a classifier from MALLET data files), ``classify-dir``
                (classify the contents of a directory with a saved classifier),
                ``classify-file`` (classify data from a single file with a saved
                classifier), ``classify-svmlight`` (classify data from a single
                file in SVMLight format), ``train-topics`` (train a topic model
                from MALLET data files), ``infer-topics`` (use a trained topic
                model to infer topics for new documents), ``evaluate-topics``
                (estimate the probability of new documents under a trained
                model), ``prune`` (remove features based on frequency or
                information gain), ``split`` (divide data into testing, training,
                and validation portions), ``bulk-load`` (for big input files,
                efficiently prune vocabulary and import docs).

        Returns:
            :class:`Popen` object of the MALLET subprocess.

        Example:
            >>> import tempfile
            >>> mallet = Mallet(corpus_output='.')
            >>> with tempfile.NamedTemporaryFile(suffix='.txt') as tmpfile:
            ...     num_bytes = tmpfile.write(b"This is a plain text example.")
            ...     tmpfile.flush()
            ...     process = mallet.call_mallet('import-file', input=tmpfile.name)
            >>> os.path.exists('text.vectors')
            True
        """
        args = [self.executable, command]
        for option, value in kwargs.items():
            args.append('--' + option.replace('_', '-'))
            if value is not None:
                args.append(value)

        if not all(_check_whitespace(arg) for arg in args):
            raise ValueError("Whitespaces are not allowed in '{0}'".format(args))

        # MALLET's output has to be processed in either case (logfile or
        # console logging); previously ``communicate`` was only assigned when
        # ``self.logfile`` was True, raising a NameError otherwise.
        communicate = True

        return call_commandline(args, communicate=communicate, logfile=self.logfile)

    def import_tokenized_corpus(self, tokenized_corpus, document_labels, **kwargs):
        """Creates MALLET corpus model.

        With this function you can import a ``tokenized_corpus`` to create the \
        MALLET corpus model. The MALLET command for this step is ``import-dir`` \
        with ``--keep-sequence`` (which is already defined in the function, so \
        you don't have to), but you have the ability to specify all available \
        parameters. The output will be saved in ``corpus_output``.

        Args:
            tokenized_corpus (list): Tokenized corpus containing one or more
                iterables containing tokens.
            document_labels (list): Name of each ``tokenized_document`` in ``tokenized_corpus``.
            encoding (str): Character encoding for input file. Defaults to UTF-8.
            token_regex (str): Divides documents into tokens using a regular
                expression (supports Unicode regex). Defaults to ``\p{L}[\p{L}\p{P}]+\p{L}``.
            preserve_case (bool): If False, converts all word features to lowercase.
                Defaults to False.
            remove_stopwords (bool): Ignores a standard list of very common English
                tokens. Defaults to True.
            stoplist (str): Absolute path to plain text stopword list. Defaults to None.
            extra_stopwords (str): Read whitespace-separated words from this file,
                and add them to either the default English stoplist or the list
                specified by ``stoplist``. Defaults to None.
            stop_pattern_file (str): Read regular expressions from a file, one per
                line. Tokens matching these regexps will be removed. Defaults to None.
            skip_header (bool): If True, in each document, remove text occurring
                before a blank line. This is useful for removing email or UseNet
                headers. Defaults to False.
            skip_html (bool): If True, remove text occurring inside <...>, as in
                HTML or SGML. Defaults to False.
            replacement_files (str): Files containing string replacements, one per
                line: 'A B [tab] C' replaces A B with C, 'A B' replaces A B with A_B.
                Defaults to None.
            deletion_files (str): Files containing strings to delete after
                ``replacement_files`` but before tokenization (i.e. multiword stop
                terms). Defaults to None.
            keep_sequence_bigrams (bool): If True, final data will be a
                FeatureSequenceWithBigrams rather than a FeatureVector. Defaults to False.
            binary_features (bool): If True, features will be binary. Defaults to False.
            save_text_in_source (bool): If True, save original text of document in source.
                Defaults to False.
            print_output (bool): If True, print a representation of the processed data
                to standard output. This option is intended for debugging. Defaults to
                False.

        Returns:
            The absolute path to the created MALLET corpus file.

        Example:
            >>> tokenized_corpus = [['this', 'is', 'a', 'tokenized', 'document']]
            >>> document_labels = ['document_label']
            >>> mallet = Mallet(corpus_output='.')
            >>> mallet_binary = mallet.import_tokenized_corpus(tokenized_corpus, document_labels)
            >>> os.path.exists('corpus.mallet')
            True
        """
        corpus_file = os.path.join(self.corpus_output, 'corpus.mallet')
        postprocessing.save_tokenized_corpus(tokenized_corpus, document_labels, self.corpus_output)
        self.call_mallet('import-dir', keep_sequence=None, input=self.corpus_output, output=corpus_file, **kwargs)

        _check_mallet_output(corpus_file)

        return corpus_file

    def train_topics(self, mallet_binary, cleanup=True, **kwargs):
        """Trains LDA model.

        With this function you can train a topic model. The MALLET command for \
        this step is ``train-topics`` (which is already defined in the function, \
        so you don't have to), but you have the ability to specify all available \
        parameters.

        Args:
            mallet_binary (str): Path to MALLET corpus model.
            cleanup (bool): If True, the directory ``corpus_output`` will be removed
                after modeling.
            input_model (str): The filename from which to read the binary topic
                model.
            input_state (str): The filename from which to read the gzipped Gibbs
                sampling state created by ``output_state``. The original input
                file must be included, using ``input``.
            output_model (str): The filename in which to write the binary topic
                model at the end of the iterations.
            output_state (str): The filename in which to write the Gibbs sampling
                state at the end of the iterations.
            output_model_interval (int): The number of iterations between writing
                the model (and its Gibbs sampling state) to a binary file. You must
                also set ``output_model`` to use this option, whose argument
                will be the prefix of the filenames. Default is 0.
            output_state_interval (int): The number of iterations between
                writing the sampling state to a text file. You must also set
                ``output_state`` to use this option, whose argument will be
                the prefix of the filenames. Default is 0.
            inferencer_filename (str): A topic inferencer applies a previously
                trained topic model to new documents.
            evaluator_filename (str): A held-out likelihood evaluator for new documents.
            output_topic_keys (str): The filename in which to write the top
                words for each topic and any Dirichlet parameters.
            num_top_words (int): The number of most probable words to print for
                each topic after model estimation. Default is 20.
            show_topics_interval (int): The number of iterations between printing
                a brief summary of the topics so far. Default is 50.
            topic_word_weights_file (str): The filename in which to write
                unnormalized weights for every topic and word type.
            word_topic_counts_file (str): The filename in which to write a sparse
                representation of topic-word assignments.
            diagnostics_file (str): The filename in which to write measures of
                topic quality, in XML format. Default is None.
            xml_topic_report (str): The filename in which to write the top words
                for each topic and any Dirichlet parameters in XML format.
            xml_topic_phrase_report (str): The filename in which to write the top
                words and phrases for each topic and any Dirichlet parameters in
                XML format.
            num_top_docs (int): When writing topic documents with ``output_topic_docs``,
                report this number of top documents. Default is 100.
            output_doc_topics (str): The filename in which to write the topic
                proportions per document, at the end of the iterations.
            doc_topics_threshold (float): Do not print topics with proportions less
                than this threshold value within ``output_doc_topics``. Defaults to 0.0.
            num_topics (int): Number of topics. Defaults to 10.
            num_iterations (int): Number of iterations. Defaults to 1000.
            num_threads (int): Number of threads for parallel training. Defaults to 1.
            num_icm_iterations (int): Number of iterations of iterated conditional
                modes (topic maximization). Defaults to 0.
            no_inference (bool): Load a saved model and create a report. Equivalent
                to ``num_iterations = 0``. Defaults to False.
            random_seed (int): Random seed for the Gibbs sampler. Defaults to 0.
            optimize_interval (int): Number of iterations between reestimating
                Dirichlet hyperparameters. Defaults to 0.
            optimize_burn_in (int): Number of iterations to run before first
                estimating Dirichlet hyperparameters. Defaults to 200.
            use_symmetric_alpha (bool): Only optimize the concentration parameter of
                the prior over document-topic distributions. This may reduce the
                number of very small, poorly estimated topics, but may disperse common
                words over several topics. Defaults to False.
            alpha (float): Sum over topics of smoothing over doc-topic distributions.
                ``alpha_k = [this value] / [num topics]``. Defaults to 5.0.
            beta (float): Smoothing parameter for each topic-word. Defaults to 0.01.

        Returns:
            None.

        Example:
            >>> tokenized_corpus = [['this', 'is', 'a', 'tokenized', 'document']]
            >>> document_labels = ['document_label']
            >>> mallet = Mallet(corpus_output='.')
            >>> mallet_binary = mallet.import_tokenized_corpus(tokenized_corpus, document_labels)
            >>> mallet.train_topics(mallet_binary, output_model='model.mallet', cleanup=False)
            >>> os.path.exists('model.mallet')
            True
        """
        self.call_mallet('train-topics', input=mallet_binary, **kwargs)

        _check_mallet_output('output', kwargs)

        if cleanup:
            shutil.rmtree(self.corpus_output)
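

# A minimal end-to-end sketch of this module's intended workflow, guarded so
# it only runs when the file is executed directly. It assumes a MALLET
# executable on the $PATH and write access to the current directory; the
# corpus, labels, and output filename below are illustrative placeholders,
# not fixtures shipped with the module.
if __name__ == '__main__':
    tokenized_corpus = [['first', 'tokenized', 'document'],
                        ['second', 'tokenized', 'document']]
    document_labels = ['document_one', 'document_two']

    mallet = Mallet(corpus_output='corpus_output')
    # Write the corpus to disk and turn it into a MALLET instance file ...
    mallet_binary = mallet.import_tokenized_corpus(tokenized_corpus, document_labels)
    # ... then train a small LDA model on it. ``cleanup=False`` keeps the
    # intermediate corpus directory around for inspection.
    mallet.train_topics(mallet_binary,
                        num_topics=2,
                        num_iterations=100,
                        output_model='model.mallet',
                        cleanup=False)
    print('Model written to model.mallet:', os.path.exists('model.mallet'))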