Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""Provide a dead process. """ return False
"""Initialize Flask application. """ template_folder=Path("application", "templates"), static_folder=Path("application", "static")) process = DeadProcess() return app, process
"""Initialize logging. """ format="%(message)s", filename=str(LOGFILE), filemode="w") # Disable logging for Flask and Werkzeug:
"""Initialize SQLite database. """ logging.debug("Initializing database...") db = database.get_db() with app.open_resource("schema.sql") as schemafile: schema = schemafile.read().decode("utf-8") db.executescript(schema) db.commit() database.close_db()
"""Format log messages. """ if "n_documents" in message: n = message.split("n_documents: ")[1] return "Number of documents: {}".format(n) elif "vocab_size" in message: n = message.split("vocab_size: ")[1] return "Number of types: {}".format(n) elif "n_words" in message: n = message.split("n_words: ")[1] return "Number of tokens: {}".format(n) elif "n_topics" in message: n = message.split("n_topics: ")[1] return "Number of topics: {}".format(n) elif "n_iter" in message: return "Initializing topic model..." elif "log likelihood" in message: iteration, _ = message.split("> log likelihood: ") return "Iteration {}".format(iteration[1:]) else: return message
"""Load text file, return title and content. """ filename = Path(secure_filename(textfile.filename)) title = filename.stem suffix = filename.suffix content = textfile.read().decode("utf-8") if suffix in {".xml", ".html"}: content = remove_markup(content) return title, content
"""Parse XML and drop tags. """ logging.info("Removing markup...") tree = ElementTree.fromstring(text) plaintext = ElementTree.tostring(tree, encoding="utf8", method="text") return plaintext.decode("utf-8")
"""Get Document objects. """ logging.info("Fetching documents...") for textfile in textfiles: title, content = textfile yield cophi.model.Document(content, title)
"""Get stopwords from file or corpus. """ logging.info("Fetching stopwords...") if "stopwords" in data: _, stopwords = load_textfile(data["stopwords"]) stopwords = cophi.model.Document(stopwords).tokens else: stopwords = corpus.mfw(data["mfw"]) return stopwords
"""Get data from HTML forms. """ logging.info("Fetching user data...") data = {"corpus": flask.request.files.getlist("corpus"), "topics": int(flask.request.form["topics"]), "iterations": int(flask.request.form["iterations"])} if flask.request.files.get("stopwords", None): data["stopwords"] = flask.request.files["stopwords"] else: data["mfw"] = int(flask.request.form["mfw"]) return data
"""Get topics from topic model. """ for distribution in model.topic_word_: words = list(np.array(vocabulary)[np.argsort(distribution)][:-maximum-1:-1]) yield "{}, ...".format(", ".join(words[:3])), words
"""Get document-topic distribution from topic model. """ document_topic = pd.DataFrame(model.doc_topic_) document_topic.index = titles document_topic.columns = descriptors return document_topic
"""Calculate cosine similarity between columns. """ d = matrix.T @ matrix norm = (matrix * matrix).sum(0, keepdims=True) ** .5 similarities = d / norm / norm.T return pd.DataFrame(similarities, index=descriptors, columns=descriptors)
"""Min-max scaler for a vector. """ return np.interp(vector, (vector.min(), vector.max()), (minimum, maximum))
logging.info("Creating data archive...") if DATA_EXPORT.exists(): unlink_content(DATA_EXPORT) else: DATA_EXPORT.mkdir() data_export = database.select("data_export") for name, data in data_export.items(): if name in {"stopwords"}: with Path(DATA_EXPORT, "{}.txt".format(name)).open("w", encoding="utf-8") as file: for word in data: file.write("{}\n".format(word)) else: path = Path(DATA_EXPORT, "{}.csv".format(name)) data.to_csv(path, sep=";", encoding="utf-8") shutil.make_archive(DATA_EXPORT, "zip", DATA_EXPORT)
"""Deletes the content of a directory. """ logging.info("Cleaning up in data directory...") for p in directory.rglob(pattern): if p.is_file(): p.unlink()
def series2array(s):
    """Yield [label, value] pairs from a labeled series.

    NOTE(review): the original `def` line and docstring are not visible in
    this fragment (the trailing "|" on the source line is extraction junk);
    name reconstructed — TODO confirm. `s` is presumably a pandas Series
    (anything with an `.index` attribute that zips against its values works).
    """
    for i, v in zip(s.index, s):
        yield [i, v]