Source code for dhlab.ngram.nb_ngram

import pandas as pd

from dhlab.api.nb_ngram_api import get_ngram


def nb_ngram(terms: str,
             corpus: str = 'bok',
             smooth: int = 1,
             years: tuple = (1810, 2010),
             mode: str = 'relative',
             lang: str = 'nob'):
    """Fetch N-gram frequencies for ``terms`` and return them ordered by year.

    The raw result of ``get_ngram`` is converted with ``ngram_conv``, the
    index is cast to ``int`` and the frame is sorted on that index.

    ``lang`` is not supported for ``corpus='avis'``; it is replaced by
    ``None`` when that corpus is requested.

    :param terms: The term(s) to look up, passed unchanged to ``get_ngram``
        (presumably a comma-separated string -- confirm against the API).
    :param corpus: Corpus identifier forwarded to ``get_ngram``.
        Defaults to ``'bok'``.
    :param smooth: Rolling-window size forwarded to ``ngram_conv``.
    :param years: ``(first, last)`` year bounds forwarded to ``ngram_conv``.
    :param mode: Frequency measure forwarded to ``ngram_conv``.
        Defaults to ``'relative'``.
    :param lang: Language code forwarded to ``get_ngram``; ignored for the
        ``'avis'`` corpus.
    :return: A Pandas DataFrame sorted by its integer index.

    :meta private:
    """
    # The 'avis' corpus takes no language filter, so drop whatever was given.
    effective_lang = None if corpus == "avis" else lang

    raw_ngrams = get_ngram(terms, corpus=corpus, lang=effective_lang)
    frame = ngram_conv(raw_ngrams, smooth=smooth, years=years, mode=mode)

    # Make sure the index is numeric before sorting on it.
    frame.index = frame.index.astype(int)
    return frame.sort_index()
# NOTE (LGJ, translated from Norwegian): "reverting to the original, this one
# does not work". ``ngram_conv`` is the variant called by ``nb_ngram``.
# The entry guard below used to test ``isinstance(x, list)`` and then index
# ``x['key']``, which can never succeed; it now tests for ``dict``.
def ngram_conv_old(ngrams, smooth: int = 1, years: tuple = (1810, 2013), mode: str = 'relative'):
    """Construct a dataframe with ngram mean frequencies per year over a given time period.

    :param ngrams: Iterable of ngram entries. Each usable entry is a dict with
        a ``'key'`` (used as column name) and a ``'values'`` list of dicts
        holding ``'x'`` (the year) plus the frequency fields ``'y'`` and
        ``'f'``. Empty entries (e.g. empty lists) and non-dict entries are
        skipped.
    :param smooth: Smoothing factor for the graph visualisation; used as the
        size of the triangular rolling window.
    :param years: Tuple with start and end years (both inclusive) for the
        time period of interest.
    :param mode: Frequency measure. Anything starting with ``'rel'`` (or the
        literal ``'y'``) selects the ``'y'`` field, everything else selects
        ``'f'``. Defaults to 'relative'.
    :return: pandas dataframe with mean values for each year, one column per
        ngram key.

    :meta private:
    """
    # Relative frequency lives under 'y', absolute frequency under 'f'.
    arg = 'y' if mode.startswith('rel') or mode == 'y' else 'f'

    ngc = {}
    for x in ngrams:
        # Only non-empty dict entries carry 'key' and 'values'.
        if x and isinstance(x, dict):
            ngc[x['key']] = {
                z['x']: z[arg]
                for z in x['values']
                if years[1] >= int(z['x']) >= years[0]
            }

    return pd.DataFrame(ngc).rolling(window=smooth, win_type='triang').mean()
def ngram_conv(ngrams, smooth: int = 1, years: tuple = (1810, 2013), mode: str = 'relative'):
    """Construct a dataframe with ngram mean frequencies per year over a given time period.

    :param ngrams: Iterable of ngram entries. Each usable entry supports key
        lookup and provides ``'key'`` (used as column name) and ``'values'``,
        a list of dicts holding ``'x'`` (the year) plus the frequency fields
        ``'y'`` and ``'f'``. Empty entries (empty lists, empty dicts,
        ``None``) are skipped.
    :param smooth: Smoothing factor for the graph visualisation; used as the
        size of the triangular rolling window.
    :param years: Tuple with start and end years (both inclusive) for the
        time period of interest. Values are converted with ``int``.
    :param mode: Frequency measure. Anything starting with ``'rel'`` (or the
        literal ``'y'``) selects the ``'y'`` field, everything else selects
        ``'f'``. Defaults to 'relative'.
    :return: pandas dataframe with mean values for each year, one column per
        ngram key.

    :meta private:
    """
    # Relative frequency lives under 'y', absolute frequency under 'f'.
    arg = 'y' if mode.startswith('rel') or mode == 'y' else 'f'

    # Convert the bounds once instead of once per data point.
    first_year, last_year = int(years[0]), int(years[1])

    ngc = {}
    for x in ngrams:
        # Ngrams without data are encoded as empty lists; skip those and any
        # other empty entry rather than failing on the key lookup.
        if not x:
            continue
        ngc[x['key']] = {
            z['x']: z[arg]
            for z in x['values']
            if first_year <= int(z['x']) <= last_year
        }

    return pd.DataFrame(ngc).rolling(window=smooth, win_type='triang').mean()