Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

import json
import logging
import xml
import xml.etree.ElementTree

import cophi
import lda
import numpy as np
import pandas as pd

from application import database
from application import utils

def wrapper():
    """Run the complete topic modeling workflow.

    Fetches user data, stores the corpus, preprocesses it, trains an LDA
    model, extracts the model output, computes similarity matrices, and
    writes every intermediate result to the database.  Acts as the
    top-level error boundary: known failure modes (malformed XML, wrong
    text encoding) get a specific hint logged; anything else is logged
    generically.  Always returns None.
    """
    try:
        logging.info("Just started topic modeling workflow.")
        data = utils.get_data("corpus",
                              "topics",
                              "iterations",
                              "stopwords",
                              "mfw")
        # A meaningful topic model needs a minimum corpus size.
        if len(data["corpus"]) < 10:
            raise ValueError("Your corpus is too small. "
                             "Please select at least 10 text files.")
        logging.info("Fetched user data...")
        database.insert_into("textfiles",
                             data["corpus"])
        logging.info("Inserted data into database.")

        # 1. Preprocess:
        dtm, token_freqs, parameters = preprocess(data)
        logging.info("Successfully preprocessed data.")
        database.insert_into("token_freqs",
                             json.dumps(token_freqs))
        # 2. Create model:
        model = create_model(dtm, data["topics"], data["iterations"])
        parameters["log_likelihood"] = int(model.loglikelihood())
        database.insert_into("parameters", json.dumps(parameters))
        logging.info("Successfully created topic model.")
        # 3. Get model output:
        topics, descriptors, document_topic = get_model_output(model, dtm)
        logging.info("Got model output.")
        # 4. Calculate similarities:
        topic_similarities, document_similarities = get_similarities(document_topic)
        logging.info("Successfully calculated topic and document similarities.")

        # Serialize everything as JSON for storage; force_ascii=False /
        # ensure_ascii=False keep non-ASCII tokens readable.
        data = {"document_topic": document_topic.to_json(orient="index", force_ascii=False),
                "topics": json.dumps(topics, ensure_ascii=False),
                "document_similarities": document_similarities.to_json(force_ascii=False),
                "topic_similarities": topic_similarities.to_json(force_ascii=False)}
        database.insert_into("model", data)
        logging.info("Successfully inserted data into database.")
        logging.info("Very nice, great success!")
    except xml.etree.ElementTree.ParseError as error:
        # NOTE: this requires the xml.etree.ElementTree submodule to be
        # imported at file level — a bare `import xml` does not load it.
        logging.error("ERROR: There is something wrong with your XML files.")
        logging.error("ERROR: %s", error)
        logging.error("Redirect to error page...")
    except UnicodeDecodeError as error:
        logging.error("ERROR: There is something wrong with your text files. "
                      "Are they UTF-8 encoded?")
        logging.error("ERROR: %s", error)
        logging.error("Redirect to error page...")
    except Exception as error:
        # Top-level boundary: log and swallow anything unexpected so the
        # caller (presumably a web request handler) can show an error page.
        logging.error("ERROR: %s", error)
        logging.error("Redirect to error page...")

 

 

def preprocess(data):
    """Build, clean, and summarize the text corpus.

    Constructs a cophi Corpus from the stored text files, removes
    stopwords and hapax legomena from the document-term matrix, and
    persists the stopword list.

    Returns a tuple of (cleaned document-term matrix, per-document
    token counts as a list, parameter dict describing the run).
    """
    # Build the corpus from the text files stored in the database:
    textfiles = database.select("textfiles")
    corpus = cophi.model.Corpus(utils.get_documents(textfiles))
    num_tokens = corpus.num_tokens
    database.update("textfiles", num_tokens.to_dict())
    # Corpus dimensions before cleaning:
    n_documents, n_types = corpus.dtm.shape
    n_tokens = num_tokens.sum()
    # Features to drop: user/most-frequent stopwords plus hapax legomena:
    stopwords = utils.get_stopwords(data, corpus)
    hapax = corpus.hapax
    logging.info("Cleaning corpus...")
    dtm = corpus.drop(corpus.dtm, set(stopwords) | set(hapax))
    # Persist the stopword list:
    database.insert_into("stopwords", json.dumps(stopwords))
    # Collect the parameters of this run:
    parameters = {"n_topics": int(data["topics"]),
                  "n_iterations": int(data["iterations"]),
                  "n_documents": int(n_documents),
                  "n_stopwords": int(len(stopwords)),
                  "n_hapax": int(len(hapax)),
                  "n_tokens": int(n_tokens),
                  "n_types": int(n_types)}
    return dtm, num_tokens.tolist(), parameters

 

 

def create_model(dtm, topics, iterations):
    """Train an LDA topic model on a document-term matrix.

    Parameters
    ----------
    dtm : pandas.DataFrame
        Document-term matrix (documents x types).
    topics : int or str
        Number of topics; coerced to int.
    iterations : int or str
        Number of Gibbs sampling iterations; coerced to int.

    Returns
    -------
    lda.LDA
        The fitted model.
    """
    logging.info("Creating topic model...")
    # Coerce to int: the caller passes data["topics"]/data["iterations"]
    # straight from user input, while preprocess() int()-casts the same
    # values — so they may well be strings here.
    model = lda.LDA(n_topics=int(topics),
                    n_iter=int(iterations))
    model.fit(dtm.values)
    return model

 

 

def get_model_output(model, dtm):
    """Extract topics and distributions from a fitted topic model.

    Returns a tuple of (topics dict keyed by descriptor, list of
    descriptors, document-topic distribution).
    """
    logging.info("Fetching model output...")
    # Map each topic descriptor to its word list:
    topic_dict = dict(utils.get_topics(model, dtm.columns))
    labels = list(topic_dict)
    # Per-document topic distribution, columns labeled with descriptors:
    distribution = utils.get_document_topic(model, dtm.index, labels)
    return topic_dict, labels, distribution

 

 

def get_similarities(document_topic):
    """Calculate cosine similarities between topic and document vectors.

    Parameters
    ----------
    document_topic :
        Document-topic distribution (documents as rows, topics as
        columns) — presumably a pandas DataFrame, given the .values /
        .columns / .index / .T accesses.

    Returns
    -------
    tuple
        (topic similarity matrix, document similarity matrix).
    """
    logging.info("Calculating topic similarities...")
    topics = utils.get_cosine(document_topic.values, document_topic.columns)
    # Fixed typo in log message ("similarites" -> "similarities").
    logging.info("Calculating document similarities...")
    documents = utils.get_cosine(document_topic.T.values, document_topic.index)
    return topics, documents