# Source code for dhlab.text.nb_search_text

from collections import Counter

import networkx as nx
import pandas as pd

from dhlab.api.nb_search_api import get_df, get_konks
from dhlab.legacy.nbtext import frame, frame_sort
from dhlab.text.nbtokenizer import tokenize


def phrase_plots(
        phrase_sets,
        title='aftenposten',
        fra=1960,
        til=2020,
        step=5,
        rot=0,
        colours=None):
    """Plot yearly frequencies for several phrase sets as a binned bar chart.

    :param phrase_sets: iterable of phrase lists; each list is queried as one
        series and labelled with the comma-joined phrases.
    :param title: newspaper title passed through to :func:`get_df`.
    :param fra: first year of the binning range (inclusive).
    :param til: last year of the binning range; note ``range(fra, til, step)``
        excludes ``til`` itself.
    :param step: bin width in years.
    :param rot: x-tick label rotation for the plot.
    :param colours: bar colours, one per phrase set; defaults to
        ``['r', 'b', 'g', 'y', 'm', 'c']``.
    :return: None — the function plots as a side effect.
    """
    # Avoid a mutable default argument; fall back to the original palette.
    if colours is None:
        colours = ['r', 'b', 'g', 'y', 'm', 'c']
    df_all = []
    for f in phrase_sets:
        # One frequency series per phrase set, labelled with its phrases.
        df_all.append(frame(get_df(f, title=title), ', '.join(f)))
    df = pd.concat(df_all, sort=False)
    df.index = df.index.astype(int)
    df = df.sort_index()
    # Group years into `step`-sized bins before plotting.
    df['bins'] = pd.cut(df.index, range(fra, til, step), precision=0)
    df.groupby('bins').sum().plot(
        kind='bar', color=colours, figsize=(15, 5), rot=rot)
    return
def phrase_plots_anno(
        phrase_sets,
        title='aftenposten',
        fra=1960,
        til=2020,
        rot=0,
        colours=None):
    """Plot yearly frequencies for several phrase sets, one bar per year.

    Unlike :func:`phrase_plots`, the years are not binned.

    :param phrase_sets: iterable of phrase lists; each list is queried as one
        series and labelled with the comma-joined phrases.
    :param title: newspaper title passed through to :func:`get_df`.
    :param fra: kept for interface compatibility — currently unused.
    :param til: kept for interface compatibility — currently unused.
    :param rot: x-tick label rotation for the plot.
    :param colours: bar colours, one per phrase set; defaults to
        ``['r', 'b', 'g']``.
    :return: None — the function plots as a side effect.
    """
    # Avoid a mutable default argument; fall back to the original palette.
    if colours is None:
        colours = ['r', 'b', 'g']
    df_all = []
    for f in phrase_sets:
        # One frequency series per phrase set, labelled with its phrases.
        df_all.append(frame(get_df(f, title=title), ', '.join(f)))
    df = pd.concat(df_all, sort=False)
    df.index = df.index.astype(int)
    df = df.sort_index()
    df.plot(kind='bar', figsize=(15, 5), rot=rot, color=colours)
    return
def graph_from_df(df, threshold=100):
    """Build an undirected weighted graph from a co-occurrence dataframe.

    Off-diagonal cells above ``threshold`` become edges; each edge weight is
    the raw count normalized by the product of the two diagonal entries
    (i.e. ``count(x, y) / (count(x, x) * count(y, y))``).

    :param df: square dataframe whose rows and columns are labels and whose
        diagonal holds the labels' own counts.
    :param threshold: minimum raw count for a pair to become an edge.
    :return: a ``networkx.Graph`` with weighted edges.
    """
    # Stack once and reuse: the original recomputed df.stack() for every
    # single lookup, which made this quadratic in the number of cells.
    stacked = df.stack()
    normalizer = {(x, y): stacked[(x, x)] * stacked[(y, y)]
                  for (x, y) in stacked.index}
    edges = []
    for (x, y) in stacked.index:
        # Skip the diagonal; keep only pairs above the raw-count threshold.
        if x != y and stacked[(x, y)] > threshold:
            edges.append([x, y, stacked[(x, y)] / normalizer[(x, y)]])
    G = nx.Graph()
    G.add_weighted_edges_from(edges)
    return G
def get_all_konks(term, urns):
    """Collect concordances for *term* from every urn in *urns*.

    :param term: the search term passed to :func:`get_konks`.
    :param urns: iterable of urns to query, one request per urn.
    :return: a single list with all concordance hits, in urn order.
    """
    results = []
    for urn in urns:
        results.extend(get_konks(urn, term))
    return results
def collocations_from_nb(word, corpus):
    """Get a concordance, and count the words in it.

    Assume konks reside in a dataframe with columns 'after' and 'before'.
    """
    concordance = frame(get_all_konks(word, corpus))
    # Elementwise concatenation of the 'after' and 'before' string arrays,
    # then one big space-joined text.
    # NOTE(review): each row's 'after' and 'before' strings are fused with
    # no separator between them — confirm this is intentional.
    text = ' '.join(concordance['after'].values + concordance['before'].values)
    counts = Counter(tokenize(text))
    return frame_sort(frame(counts, word))
def count_from_conc(concordance):
    """From a concordance, count the words in it.

    Assume konks reside in a dataframe with columns 'after' and 'before';
    the label used for the resulting count column is taken from row 0 of
    the 'word' column.
    """
    word = concordance['word'][0]
    # Elementwise concatenation of the 'after' and 'before' string arrays,
    # then one big space-joined text.
    # NOTE(review): each row's 'after' and 'before' strings are fused with
    # no separator between them — confirm this is intentional.
    text = ' '.join(concordance['after'].values + concordance['before'].values)
    counts = Counter(tokenize(text))
    return frame_sort(frame(counts, word))