Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
# -*- coding: utf-8 -*-
This module contains functions for LDA visualization provided by `DARIAH-DE`_.
.. _Gensim: https://radimrehurek.com/gensim/index.html .. _Mallet: http://mallet.cs.umass.edu/ .. _DARIAH-DE: https://de.dariah.eu https://github.com/DARIAH-DE """
format = '%(levelname)s %(name)s: %(message)s')
# Adapted from code by Stefan Pernes """Creates a document-topic-matrix.
Description: With this function you can create a doc-topic-matrix for gensim output.
Args: corpus (mmCorpus): Gensim corpus. model: Gensim LDA model doc_labels (list): List of document labels.
Returns: Doc_topic-matrix as DataFrame
ToDo:
Example: >>> import gensim >>> corpus = [[(1, 0.5)], []] >>> gensim.corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus) >>> mm = gensim.corpora.MmCorpus('/tmp/corpus.mm') >>> type2id = {0 : "test", 1 : "corpus"} >>> doc_labels = ['doc1', 'doc2'] >>> model = gensim.models.LdaModel(corpus=mm, id2word=type2id, num_topics=1) >>> doc_topic = visualization.create_doc_topic(corpus, model, doc_labels) >>> len(doc_topic.T) == 2 >>> True """ no_of_topics = model.num_topics no_of_docs = len(doc_labels)
for doc, i in zip(corpus, range(no_of_docs)): # use document bow from corpus topic_dist = model.__getitem__(doc) # to get topic distribution from model for topic in topic_dist: # topic_dist is a list of tuples doc_topic[topic[0]][i] = topic[1] # save topic probability
topic_labels = [] for i in range(no_of_topics): topic_terms = [x[0] for x in model.show_topic(i, topn=3)] # show_topic() returns tuples (word_prob, word) topic_labels.append(" ".join(topic_terms))
doc_topic = pd.DataFrame(doc_topic, index = topic_labels, columns = doc_labels)
return doc_topic
# Adapted from code by Stefan Pernes and Allen Riddell """Plot document-topic distribution in a heat map.
Description: Use create_doc_topic() to generate the doc-topic matrix that is passed to this function.
Args: data_frame (DataFrame): Document-topic-matrix.
Returns: Plot with Heatmap
ToDo:
Example: >>> import gensim >>> corpus = [[(1, 0.5)], []] >>> gensim.corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus) >>> mm = gensim.corpora.MmCorpus('/tmp/corpus.mm') >>> type2id = {0 : "test", 1 : "corpus"} >>> doc_labels = ['doc1', 'doc2'] >>> model = gensim.models.LdaModel(corpus=mm, id2word=type2id, num_topics=1) >>> doc_topic = visualization.create_doc_topic(corpus, model, doc_labels) >>> plot = doc_topic_heatmap(doc_topic) >>> plot.get_fignumns() [1] """ data_frame = data_frame.sort_index() doc_labels = list(data_frame.index) topic_labels = list(data_frame) if len(doc_labels) > 20 or len(topic_labels) > 20: plt.figure(figsize=(20,20)) # if many items, enlarge figure plt.pcolor(data_frame, norm=None, cmap='Reds') plt.yticks(np.arange(data_frame.shape[0])+1.0, doc_labels) plt.xticks(np.arange(data_frame.shape[1])+0.5, topic_labels, rotation='90') plt.gca().invert_yaxis()
#plt.savefig(path+"/"+corpusname+"_heatmap.png") #, dpi=80) return plt
"""Plot topic disctribution in a document.
Description:
Args: doc_topic (DataFrame): Document-topic data frame. document_index (int): Index of the document to be shown.
Returns: Plot.
""" data = doc_topic[list(doc_topic)[document_index]].copy() data = data[data != 0] data = data.sort_values() values = list(data) labels = list(data.index)
plt.barh(range(len(values)), values, align = 'center', alpha=0.5)
#
# Work in progress following
#
def topicwords_in_df(model):
    """Collect the key words of every topic of a model in a DataFrame.

    Description:
        Calls ``model.show_topics()`` and, for every returned item, extracts
        all matches of a word pattern from the item's element at index 1.
        A match is a run of letters that may contain one punctuation
        character in its middle (at least two letters in total).

    Args:
        model: Object providing ``show_topics()``; the element at index 1 of
            each returned item must be a string (presumably a Gensim LDA
            model - confirm against caller).

    Returns:
        DataFrame with one row per topic ("Topic 1", "Topic 2", ...) and one
        column per key ("Key 1", "Key 2", ...). Rows with fewer keys than
        the widest row are padded by pandas.
    """
    pattern = regex.compile(r'\p{L}+\p{P}?\p{L}+')
    topics = []
    index = []
    for n, topic in enumerate(model.show_topics()):
        topics.append(pattern.findall(topic[1]))
        index.append("Topic " + str(n + 1))
    # The number of columns is the number of keys per topic, not the number
    # of topics; sizing it by len(topics) only worked when both happened to
    # be equal and made the DataFrame constructor fail otherwise.
    number_of_keys = max((len(keys) for keys in topics), default=0)
    columns = ["Key " + str(x + 1) for x in range(number_of_keys)]
    return pd.DataFrame(topics, index=index, columns=columns)
def show_wordle_for_topic(model, topic_nr, words):
    """Draw a word cloud ("wordle") for one topic of a model.

    Args:
        model: Gensim LDA model; its ``show_topic(topic_nr, words)`` result
            is converted to a dict and handed to the word cloud.
        topic_nr (int): Topic to draw. The figure title shows
            ``topic_nr + 1``.
        words (int): Number of words to show.

    Returns:
        The ``plt`` module with the word cloud drawn on a new figure.

    Note:
        Function does use wordcloud package
        -> https://pypi.python.org/pypi/wordcloud
        pip install wordcloud

    ToDo:
        Check if this function should be implemented
    """
    plt.figure()
    term_weights = dict(model.show_topic(topic_nr, words))
    cloud = WordCloud().fit_words(term_weights)
    plt.imshow(cloud)
    plt.axis("off")
    plt.title("Topic #{}".format(topic_nr + 1))
    return plt
""" Create color scheme for wordle.""" #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. #return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blues for white background
def get_topicRank(topic, topicRanksFile): #print("getting topic rank.") with open(topicRanksFile, "r") as infile: return rank
def read_mallet_word_weights(word_weights_file):
    """Read a Mallet word_weights file into grouped word scores.

    Description:
        Reads the tab-separated, header-less word_weights file into a pandas
        DataFrame, sorts the rows by column 0 (ascending) and, within equal
        values of column 0, by column 2 (descending), and groups the sorted
        rows by column 0.

    Args:
        word_weights_file: Word_weights_file created with Mallet.

    Returns:
        Pandas groupby object over the sorted DataFrame, keyed by the values
        of column 0.

    Note:
        The columns are presumably topic number, word and weight - confirm
        against the Mallet output format.

    ToDo:

    """
    word_scores = pd.read_table(word_weights_file, header=None, sep="\t")
    # DataFrame.sort() no longer exists in pandas; sort_values() is the
    # replacement and takes the sort keys through ``by``.
    word_scores = word_scores.sort_values(by=[0, 2], axis=0,
                                          ascending=[True, False])
    word_scores_grouped = word_scores.groupby(0)
    return word_scores_grouped
def _get_wordcloudwords(word_scores_grouped, number_of_top_words, topic_nr): """Transform Mallet output for wordcloud generation.
Description: Get words for wordcloud.
Args: word_scores_grouped(DataFrame): Uses read_mallet_word_weights() to get grouped word scores. topic_nr(int): Topic the wordcloud should be generated for number_of_top_words(int): Number of top words that should be considered
Returns: Words for wordcloud.
Note:
ToDo:
""" topic_word_scores = word_scores_grouped.get_group(topic_nr) top_topic_word_scores = topic_word_scores.iloc[0:number_of_top_words] topic_words = top_topic_word_scores.loc[:,1].tolist() #word_scores = top_topic_word_scores.loc[:,2].tolist() wordcloudwords = "" j = 0 #score = word_scores[j] wordcloudwords = wordcloudwords + ((word + " ")) return wordcloudwords
topic_nr, number_of_top_words, outfolder, dpi): """Generate wordclouds from Mallet output.
Description: This function uses the wordcloud module to plot wordclouds. It uses read_mallet_word_weights() and _get_wordcloudwords() to get word_scores and words for wordclouds.
Args: word_weights_file: Word_weights_file created with Mallet topic_nr(int): Topic the wordclouds should be generated for number_of_top_words(int): Number of top words that should be considered for the wordclouds outfolder(str): Specify path to save wordclouds. dpi(int): Set resolution for wordclouds.
Returns: Plot
Note:
ToDo:
"""
word_scores_grouped = read_mallet_word_weights(word_weights_file) text = _get_wordcloudwords(word_scores_grouped, number_of_top_words, topic_nr) figure_title = "topic "+ str(topic_nr) plt.imshow(default_colors) plt.title(figure_title, fontsize=30) plt.axis("off")
## Saving the image file. if not os.path.exists(outfolder):
figure_filename = "wordcloud_tp"+"{:03d}".format(topic_nr) + ".png"
topic_dist = model.topic_word_[topic_nr] topic_words = np.array(vocab)[np.argsort(topic_dist)][:-words:-1] token_value = {} for token, value in zip(topic_words, topic_dist[:-words:-1]):
"""Plot interactive doc_topic_heatmap
Description: With this function you can plot an interactive doc_topic matrix.
Args: doc_topic (DataFrame): Doc_topic matrix in a DataFrame title (str): Title shown in the plot.
Returns: bokeh plot
Note:
ToDo:
""" #from ipywidgets import interact ColumnDataSource, HoverTool, LinearColorMapper, BasicTicker, ColorBar )
'Topic': list(doc_topic.index) * len(doc_topic.columns), 'Document': [item for item in list(doc_topic.columns) for i in range(len(doc_topic.index))], 'Score': score }
x_range=documents, y_range=list(reversed(topics)), x_axis_location="above", plot_width=1024, plot_height=768, tools=TOOLS, toolbar_location='below', responsive=True)
source=source, fill_color={'field': 'Score', 'transform': mapper}, line_color=None)
ticker=BasicTicker(desired_num_ticks=len(colors)), label_standoff=6, border_line_color=None, location=(0, 0))
('Document', '@Document'), ('Topic', '@Topic'), ('Score', '@Score') ]
except: log.info("Bokeh could not be imported now using mathplotlib") doc_topic_heatmap(doc_topic)
p.add_layout(color_bar, 'right')
('Document', '@Document'), ('Topic', '@Topic'), ('Score', '@Score') ] return p
"""Creates a visualization that shows topics over time
Description: With this function you can plot topics over time using metadata stored in the documents name. Only works with mallet output.
Args: doc_topic: doc-topic matrix created by mallet.show_doc_topic_matrix labels(list[str]): first three keys in a topic to select threshold(float): threshold set to define if a topic in a document is viable starttime(int): sets starting point for visualization endtime(int): sets ending point for visualization
Returns: matplotlib plot
Note: this function is created for a corpus with filenames that look like: 1866_ArticleName.txt
ToDo: make it compatible with gensim output
"""
years=list(range(starttime,endtime)) doc_topicT = doc_topic.T for label in labels: topic_over_threshold_per_year =[] df = doc_topicT.loc[doc_topicT[label] > threshold] d = defaultdict(int) for item in df.index.values: year = item.split('_') d[year[0]]+=1 for year in years: topic_over_threshold_per_year.append(d[str(year)]) plt.plot(years, topic_over_threshold_per_year, label=label)
plt.xlabel('Year') plt.ylabel('count topics over threshold') plt.legend() fig = plt.gcf() fig.set_size_inches(18.5, 10.5) plt.show()
|