Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""Provide a dead process. """
"""Initialize Flask application. """ template_folder=str(Path("application", "templates")), static_folder=str(Path("application", "static")))
"""Initialize logging. """ format="%(message)s", filename=str(LOGFILE), filemode="w") # Disable logging for Flask and Werkzeug:
"""Initialize SQLite database. """ logging.debug("Initializing database...") db = database.get_db() with app.open_resource("schema.sql") as schemafile: schema = schemafile.read().decode("utf-8") db.executescript(schema) db.commit() database.close_db()
"""Format log messages. """ else:
"""Load text file, return title and content. """ filename = Path(secure_filename(textfile.filename)) title = filename.stem suffix = filename.suffix content = textfile.read().decode("utf-8") if suffix in {".xml", ".html"}: content = remove_markup(content) return title, content
"""Parse XML and drop tags. """ encoding="utf8", method="text")
"""Get Document objects. """
"""Get stopwords from file or corpus. """ logging.info("Fetching stopwords...") if "stopwords" in data: _, stopwords = load_textfile(data["stopwords"]) stopwords = cophi.model.Document(stopwords).tokens else: stopwords = corpus.mfw(data["mfw"]) return stopwords
"""Get data from HTML forms. """ logging.info("Fetching user data...") data = {"corpus": flask.request.files.getlist("corpus"), "topics": int(flask.request.form["topics"]), "iterations": int(flask.request.form["iterations"])} if flask.request.files.get("stopwords", None): data["stopwords"] = flask.request.files["stopwords"] else: data["mfw"] = int(flask.request.form["mfw"]) return data
"""Get topics from topic model. """ for distribution in model.topic_word_: words = list(np.array(vocabulary)[np.argsort(distribution)][:-maximum-1:-1]) yield "{}, ...".format(", ".join(words[:3])), words
"""Get document-topic distribution from topic model. """ document_topic = pd.DataFrame(model.doc_topic_) document_topic.index = titles document_topic.columns = descriptors return document_topic
"""Calculate cosine similarity between columns. """
"""Min-max scaler for a vector. """ return np.interp(vector, (vector.min(), vector.max()), (minimum, maximum))
logging.info("Creating data archive...") if DATA_EXPORT.exists(): unlink_content(DATA_EXPORT) else: DATA_EXPORT.mkdir() data_export = database.select("data_export") for name, data in data_export.items(): if name in {"stopwords"}: with Path(DATA_EXPORT, "{}.txt".format(name)).open("w", encoding="utf-8") as file: for word in data: file.write("{}\n".format(word)) else: path = Path(DATA_EXPORT, "{}.csv".format(name)) data.to_csv(path, sep=";", encoding="utf-8") shutil.make_archive(DATA_EXPORT, "zip", DATA_EXPORT)
"""Deletes the content of a directory. """ logging.info("Cleaning up in data directory...") for p in directory.rglob(pattern): if p.is_file(): p.unlink()
for i, v in zip(s.index, s): yield [i, v] |