Coverage for dariah_topics/visualization.py: 44%

Hide keyboard shortcuts

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

# -*- coding: utf-8 -*-

"""

Visualizing the Output of LDA Models

====================================

"""

__author__ = "DARIAH-DE"

__authors__ = "Steffen Pielstroem, Sina Bock, Severin Simmler"

__email__ = "pielstroem@biozentrum.uni-wuerzburg.de"

__version__ = "0.1"

__date__ = "2017-01-20"

import logging

import matplotlib

matplotlib.use('Agg')

import matplotlib.pyplot as plt

import numpy as np

import os

import pandas as pd

import regex

from collections import defaultdict

# Module logger named 'visualization' with a NullHandler attached.
log = logging.getLogger('visualization')
log.addHandler(logging.NullHandler())

# Root logger configuration: ERROR threshold, "LEVEL name: message" layout.
logging.basicConfig(
    format='%(levelname)s %(name)s: %(message)s',
    level=logging.ERROR,
)

def create_doc_topic(corpus, model, doc_labels):
    # Adapted from code by Stefan Pernes
    """Build a topic-by-document matrix from a topic model.

    Description:
        Queries ``model`` with every document of ``corpus`` and stores the
        returned ``(topic id, weight)`` pairs in a matrix that has one row
        per topic and one column per document label. Topics that are not
        reported for a document keep the weight 0.

    Args:
        corpus (iterable): Documents in the form accepted by ``model[...]``
            (e.g. a Gensim corpus). Only the first ``len(doc_labels)``
            documents are used.
        model: Object exposing ``num_topics``, ``model[doc]`` and
            ``show_topic(topic_id, topn=...)`` (e.g. a Gensim LDA model).
        doc_labels (list): Document labels, used as column labels.

    Returns:
        pandas.DataFrame: Rows are topics, labelled with the first element
        of the three entries returned by ``show_topic(i, topn=3)`` joined by
        spaces; columns are ``doc_labels``.

    Example:
        >>> import gensim
        >>> corpus = [[(1, 0.5)], []]
        >>> gensim.corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
        >>> mm = gensim.corpora.MmCorpus('/tmp/corpus.mm')
        >>> type2id = {0 : "test", 1 : "corpus"}
        >>> doc_labels = ['doc1', 'doc2']
        >>> model = gensim.models.LdaModel(corpus=mm, id2word=type2id, num_topics=1)
        >>> doc_topic = visualization.create_doc_topic(corpus, model, doc_labels)
        >>> len(doc_topic.T) == 2
        True
    """
    topic_count = model.num_topics
    document_count = len(doc_labels)
    matrix = np.zeros((topic_count, document_count))

    # Pairing with range() stops after len(doc_labels) documents.
    for bag_of_words, column in zip(corpus, range(document_count)):
        for entry in model[bag_of_words]:
            # entry[0] is used as the row index, entry[1] as the stored weight.
            matrix[entry[0]][column] = entry[1]

    # NOTE(review): element 0 of each show_topic() entry is taken as the word;
    # this presumes (word, weight) ordering - confirm for the Gensim version in use.
    row_labels = [
        " ".join([entry[0] for entry in model.show_topic(topic_id, topn=3)])
        for topic_id in range(topic_count)
    ]

    return pd.DataFrame(matrix, index=row_labels, columns=doc_labels)

def doc_topic_heatmap(data_frame):
    # Adapted from code by Stefan Pernes and Allen Riddell
    """Plot a document-topic matrix as a heat map.

    Description:
        Draws ``data_frame`` (sorted by its index) with ``plt.pcolor``.
        Index labels are placed on the y axis and column labels on the
        x axis, each centred on its cell. Use create_doc_topic() to
        generate a suitable matrix.

    Args:
        data_frame (DataFrame): Document-topic-matrix.

    Returns:
        The ``matplotlib.pyplot`` module holding the drawn heat map.

    Example:
        >>> import gensim
        >>> corpus = [[(1, 0.5)], []]
        >>> gensim.corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
        >>> mm = gensim.corpora.MmCorpus('/tmp/corpus.mm')
        >>> type2id = {0 : "test", 1 : "corpus"}
        >>> doc_labels = ['doc1', 'doc2']
        >>> model = gensim.models.LdaModel(corpus=mm, id2word=type2id, num_topics=1)
        >>> doc_topic = visualization.create_doc_topic(corpus, model, doc_labels)
        >>> plot = doc_topic_heatmap(doc_topic)
        >>> plot.get_fignums()
        [1]
    """
    data_frame = data_frame.sort_index()
    row_labels = list(data_frame.index)
    column_labels = list(data_frame)

    # With many rows or columns a larger canvas keeps the labels readable.
    if len(row_labels) > 20 or len(column_labels) > 20:
        plt.figure(figsize=(20, 20))

    plt.pcolor(data_frame, norm=None, cmap='Reds')

    # pcolor cell i spans [i, i + 1]; an offset of 0.5 centres the tick on the
    # cell on both axes (the y axis previously used 1.0, i.e. the cell edge).
    plt.yticks(np.arange(data_frame.shape[0]) + 0.5, row_labels)
    # rotation must be a number (or 'vertical'/'horizontal'); the string '90'
    # is rejected by current matplotlib releases.
    plt.xticks(np.arange(data_frame.shape[1]) + 0.5, column_labels, rotation=90)

    # Put the first row at the top of the plot.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    return plt

def plot_doc_topics(doc_topic, document_index):
    """Plot the topic distribution of a single document as horizontal bars.

    Description:
        Selects one column of ``doc_topic`` by position, drops all entries
        equal to 0, sorts the remaining values in ascending order and draws
        them with ``plt.barh``. The column label is used as the plot title.

    Args:
        doc_topic (DataFrame): Document-topic data frame (documents as columns).
        document_index (int): Position of the document column to be shown.

    Returns:
        The ``matplotlib.pyplot`` module holding the drawn bar chart.
    """
    document_label = list(doc_topic)[document_index]

    shares = doc_topic[document_label].copy()
    shares = shares[shares != 0]
    shares = shares.sort_values()

    bar_lengths = list(shares)
    topic_names = list(shares.index)

    plt.barh(range(len(bar_lengths)), bar_lengths, align='center', alpha=0.5)
    plt.yticks(range(len(bar_lengths)), topic_names)
    plt.title(document_label)
    plt.xlabel('Proportion')
    plt.ylabel('Topic')
    plt.tight_layout()
    return plt

try:

from wordcloud import WordCloud

#

# Work in progress following

#

def topicwords_in_df(model):
    """Collect the key words of every topic in a DataFrame.

    Description:
        Iterates over ``model.show_topics()`` and extracts, from element 1
        of every entry, all matches of a "letters, optional punctuation
        mark, letters" pattern. Each topic becomes one row.

    Args:
        model: Object exposing ``show_topics()`` (e.g. a Gensim LDA model).
            NOTE(review): element 1 of each entry is presumably the topic's
            formatted word string - confirm for the Gensim version in use.

    Returns:
        pandas.DataFrame: One row per topic ("Topic 1", "Topic 2", ...) and
        one column per extracted word ("Key 1", "Key 2", ...). Rows with
        fewer words than the widest topic are padded with missing values.
    """
    pattern = regex.compile(r'\p{L}+\p{P}?\p{L}+')
    topics = []
    index = []
    for n, topic in enumerate(model.show_topics()):
        topics.append(pattern.findall(topic[1]))
        index.append("Topic " + str(n + 1))

    # Size the header by the number of words per topic, not by the number of
    # topics: the two only coincide by accident and a mismatch makes the
    # DataFrame constructor fail.
    width = max([len(words) for words in topics] or [0])
    columns = ["Key " + str(x + 1) for x in range(width)]
    return pd.DataFrame(topics, index=index, columns=columns)

def show_wordle_for_topic(model, topic_nr, words):
    """Draw a word cloud for one topic of a model.

    Args:
        model: Object exposing ``show_topic(topic_nr, words)`` (e.g. a Gensim
            LDA model); its result is turned into a dict and handed to
            ``WordCloud().fit_words``.
        topic_nr (int): Topic to show; the title displays ``topic_nr + 1``.
        words (int): Number of words to show.

    Returns:
        The ``matplotlib.pyplot`` module holding the drawn word cloud.

    Note: Function does use wordcloud package -> https://pypi.python.org/pypi/wordcloud
        pip install wordcloud

    ToDo: Check if this function should be implemented
    """
    plt.figure()

    word_weights = dict(model.show_topic(topic_nr, words))
    cloud = WordCloud().fit_words(word_weights)

    plt.imshow(cloud)
    plt.axis("off")
    plt.title("Topic #" + str(topic_nr + 1))
    return plt

def get_color_scale(word, font_size, position, orientation, font_path, random_state=None):
    """Return the colour used for a word in a wordle.

    All arguments are ignored; every word gets the same HSL colour string.

    Returns:
        str: ``"hsl(245, 58%, 25%)"`` (uniform dark blue).
    """
    # Alternatives noted in the original source, both randomising lightness:
    #   "hsl(0, 00%, %d%%)" % random.randint(80, 100)    greys for black background
    #   "hsl(221, 65%%, %d%%)" % random.randint(30, 35)  dark blues for white background
    uniform_dark_blue = "hsl(245, 58%, 25%)"
    return uniform_dark_blue

def get_topicRank(topic, topicRanksFile):
    """Look up the rank of a topic in a comma-separated ranks file.

    Args:
        topic (int): Row position of the topic in the file (0-based,
            header line excluded).
        topicRanksFile (str): Path of a CSV file whose first column is used
            as index and which has a column named "Rank".

    Returns:
        int: Value of the "Rank" column in the selected row.
    """
    with open(topicRanksFile, "r") as handle:
        rank_table = pd.read_csv(handle, sep=",", index_col=0)
    selected_row = rank_table.iloc[topic]
    return int(selected_row["Rank"])

def read_mallet_word_weights(word_weights_file):
    """Read a Mallet word-weights file.

    Description:
        Reads the tab-separated, header-less file into a pandas DataFrame,
        sorts it by column 0 (ascending) and column 2 (descending) and
        groups the rows by column 0.
        NOTE(review): the columns are presumably topic number, word and
        weight - confirm against the Mallet output format.

    Args:
        word_weights_file: Path (or file object) of the word-weights file
            created with Mallet.

    Returns:
        pandas DataFrameGroupBy: Rows grouped by column 0; inside every
        group the rows are ordered by column 2, highest value first.
    """
    word_scores = pd.read_table(word_weights_file, header=None, sep="\t")
    # DataFrame.sort() has been removed from pandas; sort_values() is the
    # replacement and takes the sort keys via ``by``.
    word_scores = word_scores.sort_values(by=[0, 2], axis=0, ascending=[True, False])
    word_scores_grouped = word_scores.groupby(0)
    return word_scores_grouped

def _get_wordcloudwords(word_scores_grouped, number_of_top_words, topic_nr):
    """Build the text a word cloud is generated from.

    Description:
        Takes the group ``topic_nr`` from the grouped word scores, keeps its
        first ``number_of_top_words`` rows and concatenates the entries of
        column 1, each followed by a single space.

    Args:
        word_scores_grouped (DataFrameGroupBy): Grouped word scores as
            returned by read_mallet_word_weights().
        number_of_top_words (int): Number of leading rows to use.
        topic_nr (int): Group (topic) the words are taken from.

    Returns:
        str: The selected words, each terminated by a space (so a non-empty
        result ends with a trailing space).
    """
    rows_of_topic = word_scores_grouped.get_group(topic_nr)
    leading_rows = rows_of_topic.iloc[0:number_of_top_words]
    selected_words = leading_rows.loc[:, 1].tolist()
    return "".join(entry + " " for entry in selected_words)

def plot_wordcloud_from_mallet(word_weights_file,
                               topic_nr,
                               number_of_top_words,
                               outfolder,
                               dpi):
    """Generate a word cloud from Mallet output and save it as PNG.

    Description:
        Uses read_mallet_word_weights() and _get_wordcloudwords() to obtain
        the top words of the topic, renders them with the wordcloud module
        and saves the figure as ``wordcloud_tpNNN.png`` (NNN = zero-padded
        topic number) inside ``outfolder``.

    Args:
        word_weights_file: Word-weights file created with Mallet.
        topic_nr (int): Topic the word cloud should be generated for.
        number_of_top_words (int): Number of top words that should be
            considered for the word cloud.
        outfolder (str): Folder to save the word cloud in; it is created if
            it does not exist. An empty string saves into the current
            working directory.
        dpi (int): Resolution of the saved image.

    Returns:
        The ``matplotlib.pyplot`` module holding the drawn word cloud.
    """
    word_scores_grouped = read_mallet_word_weights(word_weights_file)
    text = _get_wordcloudwords(word_scores_grouped, number_of_top_words, topic_nr)
    wordcloud = WordCloud(width=600, height=400, background_color="white", margin=4).generate(text)

    # Draw the rendered image once (it was previously drawn twice on the
    # same axes, from the array and from the WordCloud object).
    plt.imshow(wordcloud.to_array())
    plt.title("topic " + str(topic_nr), fontsize=30)
    plt.axis("off")

    # Saving the image file.
    if outfolder and not os.path.exists(outfolder):
        os.makedirs(outfolder)
    figure_filename = "wordcloud_tp" + "{:03d}".format(topic_nr) + ".png"
    # os.path.join() inserts the separator when outfolder has no trailing
    # slash; plain concatenation wrote the file next to the folder instead.
    plt.savefig(os.path.join(outfolder, figure_filename), dpi=dpi)
    return plt

def plot_wordle_from_lda(model, vocab, topic_nr, words, height, width):
    """Create a word cloud of the highest-weighted words of one topic.

    Args:
        model: Object exposing ``topic_word_``, indexable by topic number
            (presumably a fitted model of the ``lda`` package - confirm).
        vocab (sequence): Vocabulary; position ``i`` names the word whose
            weight is stored at position ``i`` of the topic's distribution.
        topic_nr (int): Topic to visualise.
        words (int): Number of words to include.
        height (int): Height passed to ``WordCloud``.
        width (int): Width passed to ``WordCloud``.

    Returns:
        The ``WordCloud`` object returned by ``fit_words`` for the selected
        words and their weights.
    """
    topic_dist = np.asarray(model.topic_word_[topic_nr])

    # Positions of the `words` largest weights, strongest first. The same
    # positions select both the words and their weights; previously the
    # weights were taken from the unsorted distribution (so they did not
    # belong to the chosen words) and only `words - 1` entries were used.
    top_positions = np.argsort(topic_dist)[::-1][:words]
    tokens = np.asarray(vocab)[top_positions]
    token_value = dict(zip(tokens, topic_dist[top_positions]))

    return WordCloud(background_color='white', height=height, width=width).fit_words(token_value)

except ImportError as e:

log.info('WordCloud functions not available, they require the wordcloud module')

def doc_topic_heatmap_interactive(doc_topic, title):
    """Plot an interactive document-topic heat map.

    Description:
        Draws the doc_topic matrix as a bokeh heat map with hover tool and
        colour bar. If bokeh cannot be imported, the static matplotlib heat
        map of doc_topic_heatmap() is drawn and returned instead.

    Args:
        doc_topic (DataFrame): Doc_topic matrix in a DataFrame (topics as
            index, documents as columns).
        title (str): Title shown in the plot.

    Returns:
        The bokeh figure, or the result of doc_topic_heatmap() when bokeh is
        not available.

    Note:
        NOTE(review): ``plot_width``, ``plot_height`` and ``responsive`` are
        passed to ``figure`` unchanged; confirm they are supported by the
        installed bokeh version.
    """
    log.info("Importing functions from bokeh ...")
    try:
        from bokeh.io import output_notebook
        from bokeh.plotting import figure
        from bokeh.models import (
            ColumnDataSource,
            HoverTool,
            LinearColorMapper,
            BasicTicker,
            ColorBar
        )
    except ImportError:
        # Only a missing bokeh triggers the fallback. The fallback now
        # returns its plot; it used to continue with the undefined bokeh
        # figure and raise a NameError.
        log.info("Bokeh could not be imported now using mathplotlib")
        return doc_topic_heatmap(doc_topic)

    from math import pi

    output_notebook()

    documents = list(doc_topic.columns)
    topics = list(doc_topic.index)

    # Long format, one record per (document, topic) cell, document by
    # document. Column-major flattening yields the scores in that order.
    data = {
        'Topic': topics * len(documents),
        'Document': [document for document in documents for _ in topics],
        'Score': doc_topic.values.ravel(order='F').tolist()
    }
    df = pd.DataFrame(data)

    colors = ["#c6dbef", "#9ecae1", "#6baed6", "#4292c6", "#2171b5", "#08519c", "#08306b"]
    mapper = LinearColorMapper(palette=colors, low=df.Score.min(), high=df.Score.max())
    source = ColumnDataSource(df)

    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    p = figure(title=title,
               x_range=documents, y_range=list(reversed(topics)),
               x_axis_location="above", plot_width=1024, plot_height=768,
               tools=TOOLS, toolbar_location='below', responsive=True)

    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "9pt"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = pi / 3

    p.rect(x="Document", y="Topic", width=1, height=1,
           source=source,
           fill_color={'field': 'Score', 'transform': mapper},
           line_color=None)

    color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="10pt",
                         ticker=BasicTicker(desired_num_ticks=len(colors)),
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')

    p.select_one(HoverTool).tooltips = [
        ('Document', '@Document'),
        ('Topic', '@Topic'),
        ('Score', '@Score')
    ]
    return p

def show_topic_over_time(doc_topic, labels=['armee truppen general', 'regierung preußen partei', 'dichter goethe kunst'], threshold=0.1, starttime=1841, endtime=1920):
    """Plot, per year, how many documents exceed a topic threshold.

    Description:
        For every topic label, counts the documents whose value for that
        topic is greater than ``threshold``, grouped by the part of the
        document name in front of the first underscore, and plots one line
        per topic over the years ``starttime`` .. ``endtime - 1``.
        Only works with mallet output.

    Args:
        doc_topic: doc-topic matrix created by mallet.show_doc_topic_matrix
            (topics as index, documents as columns).
        labels (list[str]): Topic labels (first three keys of a topic) to plot.
        threshold (float): Value a topic must exceed in a document to be counted.
        starttime (int): First year shown.
        endtime (int): End of the year range (exclusive).

    Returns:
        None. The figure is resized to 18.5 x 10.5 inches and displayed
        with ``plt.show()``.

    Note: this function is created for a corpus with filenames that looks like:
        1866_ArticleName.txt

    ToDo: make it compatible with gensim output
    """
    years = list(range(starttime, endtime))
    by_document = doc_topic.T

    for label in labels:
        above_threshold = by_document.loc[by_document[label] > threshold]

        # Tally the documents by the prefix of their name (the year).
        hits_per_year = defaultdict(int)
        for document_name in above_threshold.index.values:
            name_parts = document_name.split('_')
            hits_per_year[name_parts[0]] += 1

        yearly_counts = [hits_per_year[str(year)] for year in years]
        plt.plot(years, yearly_counts, label=label)

    plt.xlabel('Year')
    plt.ylabel('count topics over threshold')
    plt.legend()

    current_figure = plt.gcf()
    current_figure.set_size_inches(18.5, 10.5)
    plt.show()