from io import StringIO
from typing import Dict, List, Tuple, Union

import pandas as pd
import requests
from requests import ConnectionError, HTTPError, JSONDecodeError
from pandas import DataFrame, Series

from dhlab.constants import BASE_URL

pd.options.display.max_rows = 100
# wildcard search for words
def wildcard_search(word, factor=2, freq_limit=10, limit=50):
    """Search for words matching a wildcard pattern, with their frequencies."""
    res = requests.get(
        f"{BASE_URL}/wildcard_word_search",
        params={
            "word": word,
            "factor": factor,
            "freq_lim": freq_limit,
            "limit": limit,
        },
    )
    return pd.DataFrame.from_dict(res.json(), orient="index", columns=["freq"])
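
# Illustrative usage (the pattern is an example): expand a wildcard into
# concrete word forms with their corpus frequencies.
#
#   hits = wildcard_search("demokrat*", factor=2, freq_limit=10)
#   hits.sort_values("freq", ascending=False).head()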
# fetch metadata
def images(text=None, part=True):
    """Retrieve images from the digital bookshelf (bokhylla).

    :param text: fulltext query expression for sqlite.
    :param part: if True, show only the image part; if a number, the whole page is shown.

    The API also accepts ``delta`` (extra pixels to show around the image when
    ``part=True``) and ``hits`` (number of images), but a bug currently prevents
    these parameters from going through.
    """
    params = locals()
    r = requests.get(f"{BASE_URL}/images", params=params)
    return r.json()
def ner_from_urn(urn: str = None, model: str = None, start_page=0, to_page=0) -> DataFrame:
    """Get NER annotations for a text (``urn``) using a spaCy ``model``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :param str model: name of a spaCy model.
        Check which models are available with :func:`show_spacy_models`.
    :return: DataFrame with annotations and their frequencies.
    """
    params = locals()
    r = requests.get(f"{BASE_URL}/ner_urn", params=params)
    df = pd.read_json(StringIO(r.json()))
    return df
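
# Illustrative usage (the model name is an assumption; check the available
# names with show_spacy_models()):
#
#   ner = ner_from_urn("URN:NBN:no-nb_digibok_2011051112001", model="nb_core_news_sm")
#   ner.head()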
def pos_from_urn(urn: str = None, model: str = None, start_page=0, to_page=0) -> DataFrame:
    """Get part-of-speech tags and dependency parse annotations for a text (``urn``) with a spaCy ``model``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :param str model: name of a spaCy model.
        Check which models are available with :func:`show_spacy_models`.
    :return: DataFrame with annotations and their frequencies.
    """
    params = locals()
    r = requests.get(f"{BASE_URL}/pos_urn", params=params)
    df = pd.read_json(StringIO(r.json()))
    return df
def show_spacy_models() -> List:
    """Show available spaCy model names."""
    try:
        r = requests.get(f"{BASE_URL}/ner_models")
        r.raise_for_status()
        res = r.json()
    except (HTTPError, JSONDecodeError, ConnectionError) as error:
        print(error.__doc__, error)
        print("The server request failed; cannot show spaCy model names.")
        res = []
    return res
def get_places(urn: str) -> DataFrame:
    """Look up placenames in a specific URN.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/places <https://api.nb.no/dhlab/#/default/post_places>`_.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/places", json=params)
    return pd.DataFrame(r.json())
def geo_lookup(
    places: List,
    feature_class: str = None,
    feature_code: str = None,
    field: str = "alternatename",
) -> DataFrame:
    """From a list of place names, return their geolocations.

    :param list places: a list of place names - max 1000.
    :param str feature_class: which GeoNames feature class to return. Example: ``P``
    :param str feature_code: which GeoNames feature code to return. Example: ``PPL``
    :param str field: which name field to match - default "alternatename".
    """
    res = requests.post(
        f"{BASE_URL}/geo_data",
        json={
            "words": places,
            "feature_class": feature_class,
            "feature_code": feature_code,
            "field": field,
        },
    )
    columns = [
        "geonameid",
        "name",
        "alternatename",
        "latitude",
        "longitude",
        "feature_class",
        "feature_code",
    ]
    return pd.DataFrame(res.json(), columns=columns)
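
# Illustrative usage: geolocate a couple of place names, keeping only
# populated places (GeoNames feature class "P", feature code "PPL", as in
# the docstring examples):
#
#   geo = geo_lookup(["Oslo", "Bergen"], feature_class="P", feature_code="PPL")
#   geo[["name", "latitude", "longitude"]]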
def get_dispersion(
    urn: str = None,
    words: List = None,
    window: int = 300,
    pr: int = 100,
) -> Series:
    """Count occurrences of words in the given URN object.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/dispersion``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :param list words: list of words. Defaults to a list of punctuation marks.
    :param int window: The number of tokens to search through per row. Defaults to 300.
    :param int pr: defaults to 100.
    :return: a ``pandas.Series`` with frequency counts of the words in the URN object.
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/dispersion", json=params)
    return pd.Series(r.json())
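
# Illustrative usage, with the documented defaults (window=300, pr=100);
# the word list is an example:
#
#   disp = get_dispersion(
#       "URN:NBN:no-nb_digibok_2011051112001", words=["og", "men"]
#   )
#   disp.plot()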
def get_identifiers(identifiers: list = None) -> list:
    """Convert a list of identifiers (oaiid, sesamid, urn or isbn10) to dhlabids."""
    res = requests.post(
        f"{BASE_URL}/identifiers",
        json={"identifiers": [i for i in identifiers if i != ""]},
    )
    return res.json()
def get_chunks(urn: str = None, chunk_size: int = 300) -> Union[Dict, List]:
    """Get the text in the document ``urn`` as frequencies of chunks
    of the given ``chunk_size``.

    Calls the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/chunks``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :param int chunk_size: Number of tokens to include in each chunk.
    :return: list of dicts with the resulting chunk frequencies, or an empty dict.
    """
    if urn is None:
        return {}
    r = requests.get(f"{BASE_URL}/chunks", params=locals())
    if r.status_code == 200:
        result = r.json()
    else:
        result = {}
    return result
def get_chunks_para(urn: str = None) -> Union[Dict, List]:
    """Fetch chunks and their frequencies from paragraphs in a document (``urn``).

    Calls the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/chunks_para``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :return: list of dicts with the resulting chunk frequencies, or an empty dict.
    """
    if urn is None:
        return {}
    r = requests.get(f"{BASE_URL}/chunks_para", params=locals())
    if r.status_code == 200:
        result = r.json()
    else:
        result = {}
    return result
def evaluate_documents(wordbags: Dict = None, urns: List[str] = None) -> DataFrame:
    """Count and aggregate occurrences of topic ``wordbags`` for each document in a list of ``urns``.

    :param dict wordbags: a dictionary of topic keywords and lists of associated words.
        Example: ``{"natur": ["planter", "skog", "fjell", "fjord"], ... }``
    :param list urns: uniform resource names, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :return: a ``pandas.DataFrame`` with the topics as columns, indexed by the dhlabids
        of the documents.
    """
    res = requests.post(
        f"{BASE_URL}/evaluate", json={"wordbags": wordbags, "urns": urns}
    )
    df = pd.DataFrame(res.json()).transpose()
    return df
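
# Illustrative usage, reusing the wordbag and URN examples from the docstring:
#
#   topics = evaluate_documents(
#       wordbags={"natur": ["planter", "skog", "fjell", "fjord"]},
#       urns=["URN:NBN:no-nb_digibok_2008051404065"],
#   )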
def get_reference(
    corpus: str = "digavis",
    from_year: int = 1950,
    to_year: int = 1955,
    lang: str = "nob",
    limit: int = 100000,
) -> DataFrame:
    """Reference frequency list of the n most frequent words from a given corpus in a given period.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/reference_corpus <https://api.nb.no/dhlab/#/default/get_reference_corpus>`_.

    :param str corpus: Document type to include in the corpus, can be either ``'digibok'`` or
        ``'digavis'``.
    :param int from_year: Starting point for the time period of the corpus.
    :param int to_year: Last year of the time period of the corpus.
    :param str lang: Language of the corpus, can be one of
        ``'nob'``, ``'nno'``, ``'sme'``, ``'sma'``, ``'smj'``, ``'fkv'``.
    :param int limit: Maximum number of most frequent words.
    :return: A ``pandas.DataFrame`` with the results.
    """
    params = locals()
    r = requests.get(BASE_URL + "/reference_corpus", params=params)
    if r.status_code == 200:
        result = r.json()
    else:
        result = []
    return pd.DataFrame(result, columns=["word", "freq"]).set_index("word")
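
# Illustrative usage: the 1000 most frequent newspaper words from 1950-1955
# (parameter values are examples):
#
#   ref = get_reference(corpus="digavis", from_year=1950, to_year=1955,
#                       lang="nob", limit=1000)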
def find_urns(docids: Union[Dict, DataFrame] = None, mode: str = "json") -> DataFrame:
    """Return a list of URNs from a collection of docids.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/find_urn`.

    :param docids: dictionary of document IDs (``{docid: URN}``) or a ``pandas.DataFrame``.
    :param str mode: Default 'json'.
    :return: the URNs that were found, in a ``pandas.DataFrame``.
    """
    params = locals()
    r = requests.post(BASE_URL + "/find_urn", json=params)
    if r.status_code == 200:
        res = pd.DataFrame.from_dict(r.json(), orient="index", columns=["urn"])
    else:
        res = pd.DataFrame()
    return res
def _ngram_doc(
    doctype: str = None,
    word: Union[List, str] = ["."],
    title: str = None,
    period: Tuple[int, int] = None,
    publisher: str = None,
    lang: str = None,
    city: str = None,
    ddk: str = None,
    topic: str = None,
) -> DataFrame:
    """Count occurrences of one or more words over a time period.

    The type of document to search through is decided by the ``doctype``.
    Filter the selection of documents with metadata.
    Use % as wildcard where appropriate - no wildcards in ``word`` or ``lang``.

    :param str doctype: API endpoint for the document type to get ngrams for.
        Can be ``'book'``, ``'periodicals'``, or ``'newspapers'``.
    :param word: Word(s) to search for.
        Can be several words in a single string, separated by comma, e.g. ``"ord,ordene,orda"``.
    :type word: str or list of str
    :param str title: Title of a specific document to search through.
    :param period: Start and end years or dates of a time period,
        given as ``(YYYY, YYYY)`` or ``(YYYYMMDD, YYYYMMDD)``.
    :type period: tuple of ints
    :param str publisher: Name of a publisher.
    :param str lang: Language as a 3-letter ISO code (e.g. ``"nob"`` or ``"nno"``).
    :param str city: City of publication.
    :param str ddk: `Dewey Decimal Classification
        <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_ identifier.
    :param str topic: Topic of the documents.
    :return: a ``pandas.DataFrame`` with the resulting frequency counts of the word(s),
        spread across years. One year per row.
    """
    params = locals()
    if isinstance(word, str):
        # assume a comma separated string
        word = [w.strip() for w in word.split(",")]
    params["word"] = tuple(word)
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/ngram_" + doctype, json=params)
    df = pd.DataFrame.from_dict(r.json(), orient="index")
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    df = pd.concat([df.loc[x] for x in columns], axis=1)
    df.columns = columns
    return df
def reference_words(
    words: List = None,
    doctype: str = "digibok",
    from_year: Union[str, int] = 1800,
    to_year: Union[str, int] = 2000,
) -> DataFrame:
    """Collect reference data for a list of words over a time period.

    Reference data are the absolute and relative frequencies of the ``words``
    across all documents of the given ``doctype`` in the given time period
    (``from_year`` - ``to_year``).

    :param list words: list of word strings.
    :param str doctype: type of reference document. Can be ``"digibok"`` or ``"digavis"``.
        Defaults to ``"digibok"``.

        .. note::
            If any other string is given as the ``doctype``,
            the resulting data is equivalent to what you get with
            ``doctype="digavis"``.

    :param int from_year: first year of publication.
    :param int to_year: last year of publication.
    :return: a DataFrame with the words' frequency data.
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/reference_words", json=params)
    if r.status_code == 200:
        res = pd.DataFrame(r.json(), columns=["word", "freq", "relative"])
    else:
        res = pd.DataFrame()
    return res
# @_docstring_parameters_from(_ngram_doc, drop="doctype")
def ngram_book(
    word: Union[List, str] = ["."],
    title: str = None,
    period: Tuple[int, int] = None,
    publisher: str = None,
    lang: str = None,
    city: str = None,
    ddk: str = None,
    topic: str = None,
) -> DataFrame:
    """Count occurrences of one or more words in books over a given time period.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/ngram_book`.

    Filter the selection of books with metadata.
    Use % as wildcard where appropriate - no wildcards in ``word`` or ``lang``.

    :param word: Word(s) to search for.
        Can be several words in a single string, separated by comma, e.g. ``"ord,ordene,orda"``.
    :type word: str or list of str
    :param str title: Title of a specific document to search through.
    :param period: Start and end years or dates of a time period,
        given as ``(YYYY, YYYY)`` or ``(YYYYMMDD, YYYYMMDD)``.
    :type period: tuple of ints
    :param str publisher: Name of a publisher.
    :param str lang: Language as a 3-letter ISO code (e.g. ``"nob"`` or ``"nno"``).
    :param str city: City of publication.
    :param str ddk: `Dewey Decimal Classification
        <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_ identifier.
    :param str topic: Topic of the documents.
    :return: a ``pandas.DataFrame`` with the resulting frequency counts of the word(s),
        spread across years. One year per row.
    """
    params = locals()
    if isinstance(word, str):
        # assume a comma separated string
        word = [w.strip() for w in word.split(",")]
    params["word"] = tuple(word)
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/ngram_book", json=params)
    df = pd.DataFrame.from_dict(r.json(), orient="index")
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    df = pd.concat([df.loc[x] for x in columns], axis=1)
    df.columns = columns
    return df
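
# Illustrative usage: yearly book frequencies for two word forms in Bokmål
# (word list and period are examples):
#
#   trend = ngram_book(word="ord,ordene", lang="nob", period=(1950, 2000))
#   trend.plot()  # one column per word form, one row per year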
# @_docstring_parameters_from(_ngram_doc, drop="doctype")
def ngram_periodicals(
    word: Union[List, str] = ["."],
    title: str = None,
    period: Tuple[int, int] = None,
    publisher: str = None,
    lang: str = None,
    city: str = None,
    ddk: str = None,
    topic: str = None,
    **kwargs,
) -> DataFrame:
    """Get a time series of frequency counts for ``word`` in periodicals.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/ngram_periodicals`.

    :param word: Word(s) to search for.
        Can be several words in a single string, separated by comma, e.g. ``"ord,ordene,orda"``.
    :type word: str or list of str
    :param str title: Title of a specific document to search through.
    :param period: Start and end years or dates of a time period,
        given as ``(YYYY, YYYY)`` or ``(YYYYMMDD, YYYYMMDD)``.
    :type period: tuple of ints
    :param str publisher: Name of a publisher.
    :param str lang: Language as a 3-letter ISO code (e.g. ``"nob"`` or ``"nno"``).
    :param str city: City of publication.
    :param str ddk: `Dewey Decimal Classification
        <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_ identifier.
    :param str topic: Topic of the documents.
    :return: a ``pandas.DataFrame`` with the resulting frequency counts of the word(s),
        spread across years. One year per row.
    """
    params = locals()
    # fold any extra keyword arguments into the top-level parameter dict
    # instead of sending them as a nested "kwargs" entry
    params.update(params.pop("kwargs", {}))
    if isinstance(word, str):
        # assume a comma separated string
        word = [w.strip() for w in word.split(",")]
    params["word"] = tuple(word)
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/ngram_periodicals", json=params)
    df = pd.DataFrame.from_dict(r.json(), orient="index")
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    df = pd.concat([df.loc[x] for x in columns], axis=1)
    df.columns = columns
    return df
def ngram_news(
    word: Union[List, str] = ["."],
    title: str = None,
    period: Tuple[int, int] = None,
) -> DataFrame:
    """Get a time series of frequency counts for ``word`` in newspapers.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/ngram_newspapers`.

    :param word: Word(s) to search for.
        Can be several words in a single string, separated by comma, e.g. ``"ord,ordene,orda"``.
    :type word: str or list of str
    :param str title: Title of a specific newspaper to search through.
    :param period: Start and end years or dates of a time period,
        given as ``(YYYY, YYYY)`` or ``(YYYYMMDD, YYYYMMDD)``.
    :type period: tuple of ints
    :return: a ``pandas.DataFrame`` with the resulting frequency counts of the word(s),
        spread across the dates given in the time period. Either one year or one day per row.
    """
    params = locals()
    if isinstance(word, str):
        # assume a comma separated string
        word = [w.strip() for w in word.split(",")]
    params["word"] = tuple(word)
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/ngram_newspapers", json=params)
    df = pd.DataFrame.from_dict(r.json(), orient="index")
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    df = pd.concat([df.loc[x] for x in columns], axis=1)
    df.columns = columns
    return df
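
# Illustrative usage: day-level newspaper frequencies within one year, using
# YYYYMMDD bounds as described in the docstring (the word is an example):
#
#   news = ngram_news(word="valg", period=(20090101, 20091231))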
def get_document_frequencies(
    urns: List[str] = None, cutoff: int = 0, words: List[str] = None
) -> DataFrame:
    """Fetch frequency counts of ``words`` in documents (``urns``).

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/frequencies`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param int cutoff: minimum frequency of a word to be counted.
    :param list words: a list of words to be counted. If None, the whole document
        is returned; otherwise both the counts and their relative frequencies
        are returned.
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/frequencies", json=params)
    result = r.json()
    # the shape of the response differs depending on whether words were passed
    if words is None:
        structure = dict()
        for u in result:
            try:
                structure[u[0][0]] = dict([(x[1], x[2]) for x in u])
            except IndexError:
                pass
        df = pd.DataFrame(structure)
        df = df.sort_values(by=df.columns[0], ascending=False).fillna(0)
    else:
        df = pd.DataFrame(result)
        df.columns = ["urn", "word", "freq", "urncount"]
        df["relfreq"] = df["freq"] / df.urncount
        df = pd.pivot_table(
            df, values=["freq", "relfreq"], index="word", columns="urn"
        ).fillna(0)
    return df
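
# Illustrative usage: absolute and relative frequencies of selected words per
# document; the result is pivoted with ("freq"/"relfreq", urn) column pairs
# (URNs from the docstring example, word list is hypothetical):
#
#   freqs = get_document_frequencies(
#       urns=["URN:NBN:no-nb_digibok_2008051404065",
#             "URN:NBN:no-nb_digibok_2010092120011"],
#       words=["og", "men"],
#   )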
def get_word_frequencies(
    urns: List[str] = None, cutoff: int = 0, words: List[str] = None
) -> DataFrame:
    """Fetch frequency numbers for ``words`` in documents (``urns``).

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/frequencies`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param int cutoff: minimum frequency of a word to be counted.
    :param list words: a list of words to be counted - should not be left None.
    """
    return get_document_frequencies(urns, cutoff, words)
def get_urn_frequencies(urns: List[str] = None, dhlabid: List = None) -> DataFrame:
    """Fetch frequency counts for documents given as URNs or DH-lab ids.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/frequencies`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param list dhlabid: list of dhlabid numbers, for example: ``[1000001, 2000003]``
    """
    # the server accepts either urns or dhlabids, so only one of them is passed
    if dhlabid is None:
        params = {"urns": urns}
    else:
        params = {"dhlabid": dhlabid}
    r = requests.post(f"{BASE_URL}/urn_frequencies", json=params)
    df = pd.DataFrame(r.json())
    df.columns = ["urn", "freq"]
    return df
def get_document_corpus(**kwargs):
    """Wrapper for :func:`document_corpus`."""
    return document_corpus(**kwargs)
def document_corpus(
    doctype: str = None,
    author: str = None,
    freetext: str = None,
    fulltext: str = None,
    from_year: int = None,
    to_year: int = None,
    from_timestamp: int = None,
    to_timestamp: int = None,
    title: str = None,
    ddk: str = None,
    subject: str = None,
    lang: str = None,
    limit: int = None,
    order_by: str = None,
) -> DataFrame:
    """Fetch a corpus based on metadata.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/build_corpus <https://api.nb.no/dhlab/#/default/post_build_corpus>`_.

    :param str doctype: ``"digibok"``, ``"digavis"``, ``"digitidsskrift"`` or ``"digistorting"``.
    :param str author: Name of an author.
    :param str freetext: free-text search across any of the parameters,
        for example: ``"digibok AND Ibsen"``.
    :param str fulltext: words within the publication.
    :param int from_year: Start year for the time period of interest.
    :param int to_year: End year for the time period of interest.
    :param int from_timestamp: Start date for the time period of interest.
        Format: ``YYYYMMDD``; books have ``YYYY0101``.
    :param int to_timestamp: End date for the time period of interest.
        Format: ``YYYYMMDD``; books have ``YYYY0101``.
    :param str title: Name or title of a document.
    :param str ddk: `Dewey Decimal Classification
        <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_ identifier.
    :param str subject: subject (keywords) of the publication.
    :param str lang: Language of the publication, as a 3-letter ISO code.
        Example: ``"nob"`` or ``"nno"``.
    :param int limit: number of items to sample.
    :param str order_by: order of elements in the corpus object, typically used
        in combination with a limit. One of ``"random"`` (random order, the slowest),
        ``"rank"`` (ordered by relevance, faster) or ``"first"`` (breadth-first,
        using the order in the database table, the fastest method).
    :return: a ``pandas.DataFrame`` with the corpus information.
    """
    params = locals()
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/build_corpus", json=params)
    return pd.DataFrame(r.json())
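
# Illustrative usage: a random sample of ten Bokmål books from the 1990s
# (parameter values are examples only):
#
#   corpus = document_corpus(
#       doctype="digibok", lang="nob", from_year=1990, to_year=1999,
#       limit=10, order_by="random",
#   )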
def urn_collocation(
    urns: List = None,
    word: str = "arbeid",
    before: int = 5,
    after: int = 0,
    samplesize: int = 200000,
) -> DataFrame:
    """Create a collocation from a list of URNs.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/urncolldist_urn`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param str word: word to construct collocation with.
    :param int before: number of words preceding the given ``word``.
    :param int after: number of words following the given ``word``.
    :param int samplesize: total number of ``urns`` to search through.
    :return: a ``pandas.DataFrame`` with distance (sum of distances and bayesian distance)
        and frequency for words collocated with ``word``.
    """
    params = {
        "urn": urns,
        "word": word,
        "before": before,
        "after": after,
        "samplesize": samplesize,
    }
    r = requests.post(BASE_URL + "/urncolldist_urn", json=params)
    return pd.read_json(StringIO(r.json()))
def totals(top_words: int = 50000) -> DataFrame:
    """Get aggregated raw frequencies of all words in the National Library's database.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/totals/{top_words} <https://api.nb.no/dhlab/#/default/get_totals__top_words_>`_.

    :param int top_words: The number of words to get total frequencies for.
    :return: a ``pandas.DataFrame`` with the most frequent words.
    """
    r = requests.get(BASE_URL + f"/totals/{top_words}")
    return pd.DataFrame.from_dict(dict(r.json()), orient="index", columns=["freq"])
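
# Illustrative usage: fetch the 100 most frequent words overall, e.g. as a
# reference distribution for normalizing other counts:
#
#   top = totals(top_words=100)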
def concordance(
    urns: list = None, words: str = None, window: int = 25, limit: int = 100
) -> DataFrame:
    """Get a list of concordances from the National Library's database.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/conc <https://api.nb.no/dhlab/#/default/post_conc>`_.

    :param list urns: uniform resource names, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param str words: Word(s) to search for.
        Can be an SQLite fulltext query (an fts5 search expression).
    :param int window: number of tokens to show on either side of the match, between 1 and 25.
    :param int limit: max. number of concordances per document. Maximum value is 1000.
    :return: a table of concordances.
    """
    if words is None:
        return {}  # exit condition
    params = {"urns": urns, "query": words, "window": window, "limit": limit}
    r = requests.post(BASE_URL + "/conc", json=params)
    return pd.DataFrame(r.json())
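
# Illustrative usage: concordances for an fts5 phrase query in one document
# (URN from the docstring example; the query string is hypothetical):
#
#   conc = concordance(
#       urns=["URN:NBN:no-nb_digibok_2008051404065"],
#       words='"det norske"',
#       window=20,
#       limit=10,
#   )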
def concordance_counts(
    urns: list = None, words: str = None, window: int = 25, limit: int = 100
) -> DataFrame:
    """Count concordances (keyword in context) for a corpus query (used for collocation analysis).

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/conccount <https://api.nb.no/dhlab/#/default/post_conccount>`_.

    :param list urns: uniform resource names, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param str words: Word(s) to search for.
        Can be an SQLite fulltext query (an fts5 search expression).
    :param int window: number of tokens to count on either side of the match, between 1 and 25.
    :param int limit: max. number of concordances per document. Maximum value is 1000.
    :return: a table of counts.
    """
    if words is None:
        return {}  # exit condition
    params = {"urns": urns, "query": words, "window": window, "limit": limit}
    r = requests.post(BASE_URL + "/conccount", json=params)
    return pd.DataFrame(r.json())
def konkordans(urns: list = None, words: str = None, window: int = 25, limit: int = 100):
    """Wrapper for :func:`concordance`."""
    return concordance(**locals())
def word_concordance(
    urn: list = None,
    dhlabid: list = None,
    words: list = None,
    before: int = 12,
    after: int = 12,
    limit: int = 100,
    samplesize: int = 50000,
) -> DataFrame:
    """Get a list of concordances from the National Library's database.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/conc <https://api.nb.no/dhlab/#/default/conc_word_urn>`_.

    :param list urn: list of URNs to search through.
    :param list dhlabid: list of dhlab serial ids to search through.
    :param list words: word(s) to search for -- must be a list.
    :param int before: context size before the target, between 0 and 24.
    :param int after: context size after the target, between 0 and 24
        (before + after <= 24).
    :param int limit: max. number of concordances per server process.
    :param int samplesize: number of documents to sample from the given urns.
    :return: a table of concordances.
    """
    # The server accepts either dhlabids or urns, so only one of them is
    # passed. The result identifies documents by dhlabid either way.
    if dhlabid is not None:
        params = {"dhlabid": dhlabid, "words": words, "before": before,
                  "after": after, "limit": limit, "samplesize": samplesize}
    elif urn is not None:
        params = {"urn": urn, "words": words, "before": before,
                  "after": after, "limit": limit, "samplesize": samplesize}
    else:
        params = {"words": words, "before": before, "after": after,
                  "limit": limit, "samplesize": samplesize}
    r = requests.post(BASE_URL + "/conc_word_urn", json=params)
    return pd.DataFrame(
        [x for y in r.json() for x in y],
        columns=["dhlabid", "before", "target", "after"],
    )
def collocation(
    corpusquery: str = "norge", word: str = "arbeid", before: int = 5, after: int = 0
) -> DataFrame:
    """Make a collocation from a corpus query.

    :param str corpusquery: query string.
    :param str word: target word for the collocations.
    :param int before: number of words prior to ``word``.
    :param int after: number of words following ``word``.
    :return: a dataframe with the resulting collocations.
    """
    params = {
        "metadata_query": corpusquery,
        "word": word,
        "before": before,
        "after": after,
    }
    r = requests.post(BASE_URL + "/urncolldist", json=params)
    return pd.read_json(StringIO(r.json()))
# Norwegian word bank
def word_variant(word: str, form: str, lang: str = "nob") -> list:
    """Find alternative ``form`` for a given ``word`` form.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/variant_form``.

    Example: ``word_variant('spiste', 'pres-part')``

    :param str word: any word string.
    :param str form: a morphological feature tag from the Norwegian word bank
        `"Ordbanken" <https://www.nb.no/sprakbanken/ressurskatalog/oai-nb-no-sbr-5/>`_.
    :param str lang: either "nob" or "nno".
    """
    r = requests.get(
        f"{BASE_URL}/variant_form", params={"word": word, "form": form, "lang": lang}
    )
    return r.json()
def word_paradigm(word: str, lang: str = "nob") -> list:
    """Find paradigms for a given ``word`` form.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/paradigm``.

    Example:

    .. code-block:: python

        word_paradigm('spiste')
        # [['adj', ['spisende', 'spist', 'spiste']],
        #  ['verb', ['spis', 'spise', 'spiser', 'spises', 'spist', 'spiste']]]

    :param str word: any word string.
    :param str lang: either "nob" or "nno".
    """
    r = requests.get(f"{BASE_URL}/paradigm", params={"word": word, "lang": lang})
    return r.json()
def word_paradigm_many(wordlist: list, lang: str = "nob") -> list:
    """Find alternative forms for a list of words."""
    r = requests.post(f"{BASE_URL}/paradigms", json={"words": wordlist, "lang": lang})
    return r.json()
def word_lemma(word: str, lang: str = "nob") -> list:
    """Find the list of possible lemmas for a given ``word`` form."""
    r = requests.get(f"{BASE_URL}/word_lemma", params={"word": word, "lang": lang})
    return r.json()
def word_lemma_many(wordlist: list, lang: str = "nob") -> list:
    """Find lemmas for a list of given word forms."""
    # batch counterpart of word_lemma; assumes a plural /word_lemmas endpoint,
    # mirroring how word_paradigm_many relates to word_paradigm
    r = requests.post(f"{BASE_URL}/word_lemmas", json={"words": wordlist, "lang": lang})
    return r.json()
def query_imagination_corpus(
    category=None, author=None, title=None, year=None, publisher=None, place=None, oversatt=None
):
    """Fetch data from the imagination corpus."""
    params = locals()
    params = {key: params[key] for key in params if params[key] is not None}
    r = requests.get(f"{BASE_URL}/imagination", params=params)
    return r.json()