Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
Handling MALLET in Python
*************************
Functions and classes of this module are for **handling `MALLET <http://mallet.cs.umass.edu/topics.php>`_ in Python**.
Contents
********

* :func:`call_commandline()`
* :class:`Mallet`
* :func:`call_mallet()`
* :func:`import_corpus()`
* :func:`train_topics()`
"""
format='%(levelname)s %(name)s: %(message)s')
return [line.decode('utf-8').replace('\n', '') for line in stdout]
if stdin == 'pipe': stdin = PIPE if stdout == 'pipe': stdout = PIPE if stderr == 'pipe': stderr = PIPE
cmd = [str(arg) for arg in cmd] log.info("Calling the command-line with {} ...".format(' '.join(cmd))) log.debug("stdin = {}".format(stdin)) log.debug("stdout = {}".format(stdout)) log.debug("stderr = {}".format(stderr))
p = Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr) decoded_stderr = _decode_stdout(p.stderr)
if communicate: if logfile: log.info("Check mallet.log in {} for logging.".format(os.getcwd())) with open('mallet.log', 'w', encoding='utf-8') as file: file.write('\n'.join(decoded_stderr)) else: [log.debug(line) for line in decoded_stderr] elif p.returncode != 0: raise OSError(decoded_stderr) else: decoded_stdout = _decode_stdout(p.stdout) log.debug(decoded_stdout) return None
if not re.search(r'\s', str(string)): return True else: return False
"Either place the executable into the $PATH or call " "{1}(executable='/path/to/mallet')").format(executable, self.__class__.__name__)) if temp_output is None: prefix = ''.join([random.choice(string.ascii_letters + string.digits) for n in range(5)]) temp_output = os.path.join(tempfile.gettempdir(), prefix) self.temp_output = temp_output self.logfile = logfile
args = [self.executable, command] for option, value in kwargs.items(): args.append('--' + option.replace('_', '-')) if value is not None: args.append(value) if not all(_check_whitespace(arg) for arg in args): raise ValueError("Whitespaces are not allowed in {}".format(args)) return call_commandline(args, self.logfile)
mallet_binary = os.path.join(self.temp_output, 'corpus.mallet') postprocessing.save_tokenized_corpus(tokenized_corpus, document_labels, self.temp_output) self.call_mallet('import-dir', keep_sequence=None, input=self.temp_output, output=mallet_binary, **kwargs) return mallet_binary
self.call_mallet('train-topics', input=mallet_binary, **kwargs) if cleanup: shutil.rmtree(self.temp_output)
|