"""Wrapper for the topic modeling workflow. """ try: logging.info("Just started topic modeling workflow.") data = utils.get_data("corpus", "topics", "iterations", "stopwords", "mfw") if len(data["corpus"]) < 10: raise ValueError("Your corpus is too small. " "Please select at least 10 text files.") logging.info("Fetched user data...") database.insert_into("textfiles", data["corpus"]) logging.info("Inserted data into database.")
# 1. Preprocess: dtm, token_freqs, parameters = preprocess(data) logging.info("Successfully preprocessed data.") database.insert_into("token_freqs", json.dumps(token_freqs)) # 2. Create model: model = create_model(dtm, data["topics"], data["iterations"]) parameters["log_likelihood"] = int(model.loglikelihood()) database.insert_into("parameters", json.dumps(parameters)) logging.info("Successfully created topic model.") # 3. Get model output: topics, descriptors, document_topic = get_model_output(model, dtm) logging.info("Got model output.") # 4. Calculate similarities: topic_similarities, document_similarities = get_similarities(document_topic) logging.info("Successfully calculated topic and document similarities.")
data = {"document_topic": document_topic.to_json(orient="index", force_ascii=False), "topics": json.dumps(topics, ensure_ascii=False), "document_similarities": document_similarities.to_json(force_ascii=False), "topic_similarities": topic_similarities.to_json(force_ascii=False)} database.insert_into("model", data) logging.info("Successfully inserted data into database.") logging.info("Very nice, great success!") except xml.etree.ElementTree.ParseError as error: logging.error("ERROR: There is something wrong with your XML files.") logging.error("ERROR: {}".format(error)) logging.error("Redirect to error page...") except UnicodeDecodeError as error: logging.error("ERROR: There is something wrong with your text files. " "Are they UTF-8 encoded?") logging.error("ERROR: {}".format(error)) logging.error("Redirect to error page...") except Exception as error: logging.error("ERROR: {}".format(error)) logging.error("Redirect to error page...")
"""Preprocess text data. """ # Constructing corpus: textfiles = database.select("textfiles") documents = utils.get_documents(textfiles) corpus = cophi.model.Corpus(documents) num_tokens = corpus.num_tokens database.update("textfiles", num_tokens.to_dict()) # Get paramter: D, W = corpus.dtm.shape N = num_tokens.sum() # Cleaning corpus: stopwords = utils.get_stopwords(data, corpus) hapax = corpus.hapax features = set(stopwords).union(set(hapax)) logging.info("Cleaning corpus...") dtm = corpus.drop(corpus.dtm, features) # Save stopwords: database.insert_into("stopwords", json.dumps(stopwords)) # Save parameters: parameters = {"n_topics": int(data["topics"]), "n_iterations": int(data["iterations"]), "n_documents": int(D), "n_stopwords": int(len(stopwords)), "n_hapax": int(len(hapax)), "n_tokens": int(N), "n_types": int(W)} return dtm, num_tokens.tolist(), parameters
"""Create a topic model. """ logging.info("Creating topic model...") model = lda.LDA(n_topics=topics, n_iter=iterations) model.fit(dtm.values) return model
"""Get topics and distributions from topic model. """ logging.info("Fetching model output...") # Topics and their descriptors: topics = dict(utils.get_topics(model, dtm.columns)) descriptors = list(topics.keys()) # Document-topic distribution: document_topic = utils.get_document_topic(model, dtm.index, descriptors) return topics, descriptors, document_topic
"""Calculate similarities between vectors. """ logging.info("Calculating topic similarities...") topics = utils.get_cosine(document_topic.values, document_topic.columns) logging.info("Calculating document similarites...") documents = utils.get_cosine(document_topic.T.values, document_topic.index) return topics, documents |