Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# Initialize logging with logfile in tempdir:
# Initialize Flask application:
def index(): """Home page. """ logging.debug("Calling home page endpoint...") if "process" in globals() and process.is_alive(): logging.info("Terminating topic modeling process...") process.terminate() logging.debug("Initializing database...") # Initialize SQLite database: utils.init_db(web) logging.debug("Rendering home page template...") return flask.render_template("index.html", help=True)
def help(): """Help page. """ logging.debug("Rendering help page template...") return flask.render_template("help.html", go_back=True)
def error(): """Error page. """ logging.error("Rendering error page...") with utils.LOGFILE.open("r", encoding="utf-8") as logfile: log = logfile.read().split("\n")[-20:] return flask.render_template("error.html", reset=True, log="\n".join(log))
def modeling(): """Modeling page. """ logging.debug("Calling modeling page endpoint...") # Must be global to use anywhere: global start global process start = time.time() process = multiprocessing.Process(target=workflow.wrapper) logging.info("Initializing topic modeling process...") process.start() logging.info("Started topic modeling process.") logging.debug("Rendering modeling page template...") return flask.render_template("modeling.html", abort=True)
def overview_topics(): """Topics overview page. """ logging.debug("Calling topics overview page endpoint...") logging.info("Get document-topic distributions...") response = get_document_topic_distributions() document_topic = pd.read_json(response, orient="index")
logging.info("Get token frequencies...") response = get_token_frequencies() token_freqs = json.loads(response)
logging.info("Add frequencies to weights...") document_topic = document_topic.multiply(token_freqs, axis=0)
logging.info("Sum the weights...") dominance = document_topic.sum(axis=0)
logging.info("Scale weights...") proportions = utils.scale(dominance) proportions = pd.Series(proportions, index=dominance.index) proportions = proportions.sort_values(ascending=False)
# Convert pandas.Series to a 2-D array: proportions = list(utils.series2array(proportions))
corpus_size = get_corpus_size() number_topics = get_number_of_topics() logging.debug("Rendering topics overview template...") return flask.render_template("overview-topics.html", current="topics", help=True, reset=True, topics=True, documents=True, document_topic_distributions=True, parameters=True, export_data=True, proportions=proportions, corpus_size=corpus_size, number_topics=number_topics)
def overview_documents(): """Documents overview page. """ logging.debug("Calling documents overview page endpoint...") sizes = pd.DataFrame(get_textfile_sizes(), columns=["title", "size"])
proportions = utils.scale(sizes["size"]) proportions = pd.Series(proportions, index=sizes["title"]) proportions = proportions.sort_values(ascending=False)
# Convert pandas.Series to a 2-D array: proportions = list(utils.series2array(proportions))
corpus_size = get_corpus_size() return flask.render_template("overview-documents.html", current="documents", help=True, reset=True, topics=True, documents=True, document_topic_distributions=True, parameters=True, export_data=True, proportions=proportions, corpus_size=corpus_size)
def document_topic_distributions(): """Document-topic distributions page. """ logging.debug("Calling document-topic distributions endpoint...") logging.debug("Rendering document-topic distributions page template...") return flask.render_template("document-topic-distributions.html", current="document-topic-distributions", help=True, reset=True, topics=True, documents=True, document_topic_distributions=True, parameters=True, export_data=True)
def topics(topic): """Topic page. """ logging.debug("Calling topic page endpoint...") logging.info("Get topics...") topics = json.loads(get_topics()) logging.info("Get document-topic distributions...") document_topic = pd.read_json(get_document_topic_distributions(), orient="index") logging.info("Get topic similarity matrix...") topic_similarites = pd.read_json(get_topic_similarities())
logging.info("Get related documents...") related_docs = document_topic[topic].sort_values(ascending=False)[:10] related_docs_proportions = utils.scale(related_docs, minimum=70) related_docs_proportions = pd.Series(related_docs_proportions, index=related_docs.index) related_docs_proportions = related_docs_proportions.sort_values(ascending=False)
# Convert pandas.Series to a 2-D array: related_docs_proportions = list(utils.series2array(related_docs_proportions))
logging.info("Get related words...") related_words = topics[topic][:15]
logging.info("Get similar topics...") similar_topics = topic_similarites[topic].sort_values(ascending=False)[1:4] logging.debug("Rendering topic page template...") return flask.render_template("detail-topic.html", current="topics", help=True, reset=True, topics=True, documents=True, document_topic_distributions=True, parameters=True, export_data=True, topic=topic, similar_topics=similar_topics.index, related_words=related_words, related_documents=related_docs_proportions)
def documents(title): """Document page. """ logging.debug("Calling document page endpoint...") logging.info("Get textfiles...") text = get_textfile(title) logging.info("Get document-topics distributions...") document_topic = pd.read_json(get_document_topic_distributions(), orient="index") logging.info("Get document similarity matrix...") document_similarites = pd.read_json(get_document_similarities())
logging.info("Get related topics...") related_topics = document_topic.loc[title].sort_values(ascending=False) * 100 distribution = list(related_topics.to_dict().items())
logging.info("Get similar documents...") similar_docs = document_similarites[title].sort_values(ascending=False)[1:4]
logging.debug("Use only the first 10000 characters (or less) from document...") text = text if len(text) < 10000 else "{}... This was an excerpt of the original text.".format(text[:10000])
logging.debug("Split paragraphs...") text = text.split("\n\n") logging.debug("Rendering document page template...") return flask.render_template("detail-document.html", current="documents", help=True, reset=True, topics=True, documents=True, document_topic_distributions=True, parameters=True, export_data=True, title=title, text=text, distribution=distribution, similar_documents=similar_docs.index, related_topics=related_topics.index)
def parameters(): """Paramter page. """ logging.debug("Calling parameters page endpoint...") logging.info("Get parameters...") data = json.loads(get_parameters())[0] info = json.loads(data) logging.debug("Rendering parameters page template...") return flask.render_template("overview-parameters.html", current="parameters", parameters=True, help=True, reset=True, topics=True, documents=True, document_topic_distributions=True, export_data=True, **info)
# API endpoints:
def get_status(): """Current modeling status. """ seconds = int(time.time() - start) elapsed_time = datetime.timedelta(seconds=seconds) with utils.LOGFILE.open("r", encoding="utf-8") as logfile: messages = logfile.readlines() message = messages[-1].strip() message = utils.format_logging(message) return "Elapsed time: {}<br>{}".format(elapsed_time, message)
def get_document_topic_distributions(): """Document-topics distributions. """ return database.select("document_topic_distributions")
def get_topics(): """Topics. """ return database.select("topics")
def get_document_similarities(): """Document similarity matrix. """ return database.select("document_similarities")
def get_topic_similarities(): """Topic similarity matrix. """ return database.select("topic_similarities")
def get_textfile(title): """Textfiles. """ return database.select("textfile", title=title)
def get_stopwords(): """Stopwords. """ return database.select("stopwords")
def get_token_frequencies(): """Token frequencies per document. """ return database.select("token_freqs")
def get_parameters(): """Model parameters. """ return json.dumps(database.select("parameters"))
def get_textfile_sizes(): """Textfile sizes. """ return database.select("textfile_sizes")
def get_corpus_size(): """Corpus size. """ return str(len(get_textfile_sizes()))
def get_number_of_topics(): """Number of topics. """ return str(len(json.loads(get_topics())))
def export(filename): """Data archive. """ if "topicsexplorer-data.zip" in {filename}: utils.export_data() path = Path(utils.TEMPDIR, filename) return flask.send_file(filename_or_fp=str(path))
def handle_http_exception(e): """Error page. """ return error()
def add_header(r): r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate" r.headers["Pragma"] = "no-cache" r.headers["Expires"] = "0" r.headers["Cache-Control"] = "public, max-age=0" return r
def close_connection(exception): db = getattr(flask.g, "_database", None) if db is not None: db.close() |