Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

import utils 

import lda 

import database 

import json 

import cophi 

import numpy as np 

import pandas as pd 

 

def wrapper():
    """Run the complete topic modeling workflow and persist its results.

    The user input (corpus, number of topics, number of iterations,
    stopwords, and most-frequent-words setting) is fetched through
    ``utils.get_data``. The corpus is written to the ``textfiles`` table,
    then the pipeline runs: preprocessing, model fitting, output
    extraction, and similarity calculation. The serialized results are
    finally written to the ``model`` table.

    Returns:
        None. The results are only stored through ``database.insert_into``.
    """
    params = utils.get_data("corpus", "topics", "iterations", "stopwords", "mfw")
    # The corpus has to be stored first: preprocess() reads it back from
    # the "textfiles" table instead of taking it from ``params``.
    database.insert_into("textfiles", params["corpus"])

    # Step 1: build the cleaned document-term matrix.
    document_term_matrix = preprocess(params)

    # Step 2: fit the topic model.
    fitted_model = create_model(
        document_term_matrix, params["topics"], params["iterations"]
    )

    # Step 3: extract topics and the document-topic distribution. The
    # descriptors are already embedded in ``doc_topic`` and not needed here.
    topics, _descriptors, doc_topic = get_model_output(
        fitted_model, document_term_matrix
    )

    # Step 4: derive the similarity matrices.
    topic_sim, doc_sim = get_similarities(doc_topic)

    # Serialize everything to JSON strings; ASCII escaping is disabled so
    # non-ASCII characters are stored verbatim.
    payload = {}
    payload["doc_topic"] = doc_topic.to_json(force_ascii=False)
    payload["topics"] = json.dumps(topics, ensure_ascii=False)
    payload["doc_sim"] = doc_sim.to_json(force_ascii=False)
    payload["topic_sim"] = topic_sim.to_json(force_ascii=False)
    database.insert_into("model", payload)

 

 

def preprocess(data):
    """Build a cleaned document-term matrix from the stored text files.

    Args:
        data: The user input as returned by ``utils.get_data``. It is only
            forwarded to ``utils.get_stopwords`` here (presumably for the
            stopword and most-frequent-words settings -- confirm against
            ``utils``).

    Returns:
        The corpus' document-term matrix with stopwords and hapax legomena
        dropped.
    """
    # Assemble the corpus from whatever is in the "textfiles" table.
    stored_texts = database.select("textfiles")
    corpus = cophi.model.Corpus(utils.get_documents(stored_texts))

    # Collect every feature that should not take part in the modeling.
    stopwords = utils.get_stopwords(data, corpus)
    hapax = corpus.hapax
    unwanted = set(stopwords) | set(hapax)

    return corpus.drop(corpus.dtm, unwanted)

 

 

def create_model(dtm, topics, iterations):
    """Fit an LDA topic model on a document-term matrix.

    Args:
        dtm: Document-term matrix; its ``.values`` attribute is handed to
            the model's ``fit`` method.
        topics: Number of topics, passed as ``n_topics``.
        iterations: Number of sampling iterations, passed as ``n_iter``.

    Returns:
        The fitted ``lda.LDA`` instance.
    """
    hyperparameters = {"n_topics": topics, "n_iter": iterations}
    topic_model = lda.LDA(**hyperparameters)
    topic_model.fit(dtm.values)
    return topic_model

 

 

def get_model_output(model, dtm):
    """Extract topics, their descriptors, and the document-topic distribution.

    Args:
        model: A fitted topic model.
        dtm: The document-term matrix the model was fitted on. Its columns
            are passed to ``utils.get_topics`` and its index to
            ``utils.get_doc_topic``.

    Returns:
        A tuple ``(topics, descriptors, doc_topic)``: the first two are
        lists, the third is whatever ``utils.get_doc_topic`` returns.
    """
    # Materialize both iterables so they can be reused and serialized.
    topic_list = [*utils.get_topics(model, dtm.columns)]
    descriptor_list = [*utils.get_topic_descriptors(topic_list)]

    # Document-topic distribution, labeled with the descriptors.
    distribution = utils.get_doc_topic(model, dtm.index, descriptor_list)
    return topic_list, descriptor_list, distribution

 

 

def get_similarities(doc_topic):
    """Calculate cosine similarities among topics and among documents.

    Args:
        doc_topic: Document-topic distribution exposing ``.values``,
            ``.columns``, ``.index``, and ``.T``.

    Returns:
        A tuple ``(topic_similarities, document_similarities)``.

    NOTE(review): pairing ``.values`` with ``.columns`` (and the transpose
    with ``.index``) assumes ``utils.get_cosine`` compares the columns of
    the matrix it receives -- confirm against ``utils``.
    """
    topic_similarities = utils.get_cosine(doc_topic.values, doc_topic.columns)
    transposed = doc_topic.T
    document_similarities = utils.get_cosine(transposed.values, doc_topic.index)
    return topic_similarities, document_similarities