# Source code for dhlab.api.dhlab_api

from io import StringIO
from typing import Dict, List, Tuple, Union

import pandas as pd
import requests

# from requests import HTTPError, JSONDecodeError, ConnectionError
from pandas import DataFrame, Series

from dhlab.constants import BASE_URL

pd.options.display.max_rows = 100

# wildcard search for words


# fetch metadata

def images(text=None, part=True):
    """Retrieve images from bokhylla.

    :param text: fulltext query expression for sqlite
    :param part: if a number, the whole page is shown ... a bug prevents these from going through
    :param delta: if part=True, show additional pixels around the image
    :param hits: number of images
    """
    params = locals()
    r = requests.get(f"{BASE_URL}/images", params=params)
    js = r.json()
    return js

def ner_from_urn(urn: str = None, model: str = None, start_page=0, to_page=0) -> DataFrame:
    """Get NER annotations for a text (``urn``) using a spaCy ``model``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :param str model: name of a spaCy model.
        Check which models are available with :func:`show_spacy_models`
    :param start_page: first page to annotate.
    :param to_page: last page to annotate.
    :return: DataFrame with annotations and their frequencies
    """
    params = locals()
    r = requests.get(f"{BASE_URL}/ner_urn", params=params)
    df = pd.read_json(r.json())
    return df

def pos_from_urn(urn: str = None, model: str = None, start_page=0, to_page=0) -> DataFrame:
    """Get part-of-speech tags and dependency parse annotations for a text (``urn``) with a spaCy ``model``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :param str model: name of a spaCy model.
        Check which models are available with :func:`show_spacy_models`
    :param start_page: first page to annotate.
    :param to_page: last page to annotate.
    :return: DataFrame with annotations and their frequencies
    """
    params = locals()
    r = requests.get(f"{BASE_URL}/pos_urn", params=params)
    df = pd.read_json(r.json())
    return df

def show_spacy_models() -> List:
    """Show available spaCy model names."""
    try:
        r = requests.get(f"{BASE_URL}/ner_models")
        # r.raise_for_status()
        res = r.json()
    except Exception:
        print("Server-request gikk ikke gjennom. Kan ikke vise SpaCy-modellnavn.")
        res = []
    return res

def get_places(urn: str) -> DataFrame:
    """Look up placenames in a specific URN.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/places <https://api.nb.no/dhlab/#/default/post_places>`_.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/places", json=params)
    # print(r.status_code)
    return pd.DataFrame(r.json())

def geo_lookup(
    places: List,
    feature_class: str = None,
    feature_code: str = None,
    field: str = "alternatename",
) -> DataFrame:
    """From a list of places, return their geolocations.

    :param list places: a list of place names - max 1000
    :param str feature_class: which GeoNames feature class to return. Example: ``P``
    :param str feature_code: which GeoNames feature code to return. Example: ``PPL``
    :param str field: which name field to match - default "alternatename".
    """
    res = requests.post(
        f"{BASE_URL}/geo_data",
        json={
            "words": places,
            "feature_class": feature_class,
            "feature_code": feature_code,
            "field": field,
        },
    )
    columns = [
        "geonameid",
        "name",
        "alternatename",
        "latitude",
        "longitude",
        "feature_class",
        "feature_code",
    ]
    return pd.DataFrame(res.json(), columns=columns)

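
# Illustrative usage sketch (not part of the library): look up a handful of place
# names and restrict the GeoNames results to populated places, as described in the
# geo_lookup docstring. Requires network access to the dhlab API.
def _example_geo_lookup() -> DataFrame:
    # "P"/"PPL" are the GeoNames class/code for populated places.
    return geo_lookup(
        ["Oslo", "Bergen", "Trondheim"], feature_class="P", feature_code="PPL"
    )
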
def get_dispersion(
    urn: str = None,
    words: List = None,
    window: int = 300,
    pr: int = 100,
) -> Series:
    """Count occurrences of words in the given URN object.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/dispersion``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :param list words: list of words. Defaults to a list of punctuation marks.
    :param int window: The number of tokens to search through per row. Defaults to 300.
    :param int pr: defaults to 100.
    :return: a ``pandas.Series`` with frequency counts of the words in the URN object.
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/dispersion", json=params)
    return pd.Series(r.json())

def get_metadata(urns: List[str] = None) -> DataFrame:
    """Get metadata for a list of URNs.

    Calls the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/get_metadata <https://api.nb.no/dhlab/#/default/post_get_metadata>`_.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/get_metadata", json=params)
    return DataFrame(r.json())

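
# Illustrative usage sketch (not part of the library): fetch catalogue metadata for
# the two example books used throughout the docstrings. Requires network access.
def _example_get_metadata() -> DataFrame:
    urns = [
        "URN:NBN:no-nb_digibok_2008051404065",
        "URN:NBN:no-nb_digibok_2010092120011",
    ]
    return get_metadata(urns)
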
def get_identifiers(identifiers: list = None) -> list:
    """Convert a list of identifiers (oaiid, sesamid, urns or isbn10) to dhlabids."""
    res = requests.post(
        f"{BASE_URL}/identifiers",
        json={"identifiers": [i for i in identifiers if i != ""]},
    )
    return res.json()

def get_chunks(urn: str = None, chunk_size: int = 300) -> Union[Dict, List]:
    """Get the text in the document ``urn`` as frequencies of chunks of the given ``chunk_size``.

    Calls the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/chunks``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :param int chunk_size: Number of tokens to include in each chunk.
    :return: list of dicts with the resulting chunk frequencies, or an empty dict
    """
    if urn is None:
        return {}
    r = requests.get(f"{BASE_URL}/chunks", params=locals())
    if r.status_code == 200:
        result = r.json()
    else:
        result = {}
    return result

def get_chunks_para(urn: str = None) -> Union[Dict, List]:
    """Fetch chunks and their frequencies from paragraphs in a document (``urn``).

    Calls the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/chunks_para``.

    :param str urn: uniform resource name, example: ``URN:NBN:no-nb_digibok_2011051112001``
    :return: list of dicts with the resulting chunk frequencies, or an empty dict
    """
    if urn is None:
        return {}
    r = requests.get(f"{BASE_URL}/chunks_para", params=locals())
    if r.status_code == 200:
        result = r.json()
    else:
        result = {}
    return result

def evaluate_documents(wordbags: Dict = None, urns: List[str] = None) -> DataFrame:
    """Count and aggregate occurrences of topic ``wordbags`` for each document in a list of ``urns``.

    :param dict wordbags: a dictionary of topic keywords and lists of associated words.
        Example: ``{"natur": ["planter", "skog", "fjell", "fjord"], ... }``
    :param list urns: uniform resource names, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :return: a ``pandas.DataFrame`` with the topics as columns, indexed by the dhlabids of the documents.
    """
    res = requests.post(
        f"{BASE_URL}/evaluate", json={"wordbags": wordbags, "urns": urns}
    )
    df = pd.DataFrame(res.json()).transpose()
    return df

def get_reference(
    corpus: str = "digavis",
    from_year: int = 1950,
    to_year: int = 1955,
    lang: str = "nob",
    limit: int = 100000,
) -> DataFrame:
    """Reference frequency list of the n most frequent words from a given corpus in a given period.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/reference_corpus <https://api.nb.no/dhlab/#/default/get_reference_corpus>`_.

    :param str corpus: Document type to include in the corpus, can be either ``'digibok'`` or ``'digavis'``.
    :param int from_year: Starting point for the time period of the corpus.
    :param int to_year: Last year of the time period of the corpus.
    :param str lang: Language of the corpus, can be one of
        ``'nob', 'nno', 'sme', 'sma', 'smj', 'fkv'``
    :param int limit: Maximum number of most frequent words.
    :return: A ``pandas.DataFrame`` with the results.
    """
    params = locals()
    r = requests.get(BASE_URL + "/reference_corpus", params=params)
    if r.status_code == 200:
        result = r.json()
    else:
        result = []
    return pd.DataFrame(result, columns=["word", "freq"]).set_index("word")

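
# Illustrative usage sketch (not part of the library): build a small reference list
# of the 1000 most frequent Bokmål newspaper words for 1950-1955 (the defaults) and
# add a relative-frequency column. Requires network access.
def _example_get_reference() -> DataFrame:
    ref = get_reference(
        corpus="digavis", from_year=1950, to_year=1955, lang="nob", limit=1000
    )
    ref["relfreq"] = ref["freq"] / ref["freq"].sum()
    return ref
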
def find_urns(docids: Union[Dict, DataFrame] = None, mode: str = "json") -> DataFrame:
    """Return a list of URNs from a collection of docids.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/find_urn`.

    :param docids: dictionary of document IDs (``{docid: URN}``) or a ``pandas.DataFrame``.
    :param str mode: Default 'json'.
    :return: the URNs that were found, in a ``pandas.DataFrame``.
    """
    params = locals()
    r = requests.post(BASE_URL + "/find_urn", json=params)
    if r.status_code == 200:
        res = pd.DataFrame.from_dict(r.json(), orient="index", columns=["urn"])
    else:
        res = pd.DataFrame()
    return res

def reference_words(
    words: List = None,
    doctype: str = "digibok",
    from_year: Union[str, int] = 1800,
    to_year: Union[str, int] = 2000,
) -> DataFrame:
    """Collect reference data for a list of words over a time period.

    Reference data are the absolute and relative frequencies of the ``words``
    across all documents of the given ``doctype`` in the given time period
    (``from_year`` - ``to_year``).

    :param list words: list of word strings.
    :param str doctype: type of reference document. Can be ``"digibok"`` or ``"digavis"``.
        Defaults to ``"digibok"``.

        .. note::
           If any other string is given as the ``doctype``,
           the resulting data is equivalent to what you get with ``doctype="digavis"``.

    :param int from_year: first year of publication
    :param int to_year: last year of publication
    :return: a DataFrame with the words' frequency data
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/reference_words", json=params)
    print(r.status_code, BASE_URL)
    if r.status_code == 200:
        res = pd.DataFrame(r.json(), columns=["word", "freq", "relative"])
    else:
        res = pd.DataFrame()
    return res

def _ngram_doc(
    doctype: str = None,
    word: Union[List, str] = ["."],
    title: str = None,
    period: Tuple[int, int] = None,
    publisher: str = None,
    lang: str = None,
    city: str = None,
    ddk: str = None,
    topic: str = None,
) -> pd.DataFrame:
    """Count occurrences of one or more words over a time period.

    The type of document to search through is decided by the ``doctype``.
    Filter the selection of documents with metadata.
    Use % as wildcard where appropriate - no wildcards in ``word`` or ``lang``.

    :param str doctype: API endpoint for the document type to get ngrams for.
        Can be ``'book'``, ``'periodicals'``, or ``'newspapers'``.
    :param word: Word(s) to search for.
        Can be several words in a single string, separated by comma, e.g. ``"ord,ordene,orda"``.
    :type word: str or list of str
    :param str title: Title of a specific document to search through.
    :param period: Start and end years or dates of a time period,
        given as ``(YYYY, YYYY)`` or ``(YYYYMMDD, YYYYMMDD)``.
    :type period: tuple of ints
    :param str publisher: Name of a publisher.
    :param str lang: Language as a 3-letter ISO code (e.g. ``"nob"`` or ``"nno"``)
    :param str city: City of publication.
    :param str ddk: `Dewey Decimal Classification
        <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_ identifier.
    :param str topic: Topic of the documents.
    :return: a ``pandas.DataFrame`` with the resulting frequency counts of the word(s),
        spread across years. One year per row.
    """
    params = locals()
    if isinstance(word, str):
        # assume a comma-separated string
        word = [w.strip() for w in word.split(",")]
    params["word"] = tuple(word)
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/ngram_" + doctype, json=params)
    # print(r.status_code)
    df = pd.DataFrame.from_dict(r.json(), orient="index")
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    df = pd.concat([df.loc[x] for x in columns], axis=1)
    df.columns = columns
    # df.index = df.index.map(pd.Timestamp)
    return df


# @_docstring_parameters_from(_ngram_doc, drop="doctype")
def ngram_book(
    word: Union[List, str] = ["."],
    title: str = None,
    period: Tuple[int, int] = None,
    publisher: str = None,
    lang: str = None,
    city: str = None,
    ddk: str = None,
    topic: str = None,
) -> DataFrame:
    """Count occurrences of one or more words in books over a given time period.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/ngram_book`.

    Filter the selection of books with metadata.
    Use % as wildcard where appropriate - no wildcards in ``word`` or ``lang``.

    :param word: Word(s) to search for.
        Can be several words in a single string, separated by comma, e.g. ``"ord,ordene,orda"``.
    :type word: str or list of str
    :param str title: Title of a specific document to search through.
    :param period: Start and end years or dates of a time period,
        given as ``(YYYY, YYYY)`` or ``(YYYYMMDD, YYYYMMDD)``.
    :type period: tuple of ints
    :param str publisher: Name of a publisher.
    :param str lang: Language as a 3-letter ISO code (e.g. ``"nob"`` or ``"nno"``)
    :param str city: City of publication.
    :param str ddk: `Dewey Decimal Classification
        <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_ identifier.
    :param str topic: Topic of the documents.
    :return: a ``pandas.DataFrame`` with the resulting frequency counts of the word(s),
        spread across years. One year per row.
    """
    params = locals()
    if isinstance(word, str):
        # assume a comma-separated string
        word = [w.strip() for w in word.split(",")]
    params["word"] = tuple(word)
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/ngram_book", json=params)
    # print(r.status_code)
    df = pd.DataFrame.from_dict(r.json(), orient="index")
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    df = pd.concat([df.loc[x] for x in columns], axis=1)
    df.columns = columns
    # df.index = df.index.map(pd.Timestamp)
    return df

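
# Illustrative usage sketch (not part of the library): yearly counts for two word
# forms in books published in Oslo 1950-2000. The comma-separated string is split
# into one result column per word, as in the function above. Requires network access.
def _example_ngram_book() -> DataFrame:
    return ngram_book(word="frihet, likhet", period=(1950, 2000), city="Oslo")
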
# @_docstring_parameters_from(_ngram_doc, drop="doctype")
def ngram_periodicals(
    word: Union[List, str] = ["."],
    title: str = None,
    period: Tuple[int, int] = None,
    publisher: str = None,
    lang: str = None,
    city: str = None,
    ddk: str = None,
    topic: str = None,
    **kwargs,
) -> DataFrame:
    """Get a time series of frequency counts for ``word`` in periodicals.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/ngram_periodicals`.

    :param word: Word(s) to search for.
        Can be several words in a single string, separated by comma, e.g. ``"ord,ordene,orda"``.
    :type word: str or list of str
    :param str title: Title of a specific document to search through.
    :param period: Start and end years or dates of a time period,
        given as ``(YYYY, YYYY)`` or ``(YYYYMMDD, YYYYMMDD)``.
    :type period: tuple of ints
    :param str publisher: Name of a publisher.
    :param str lang: Language as a 3-letter ISO code (e.g. ``"nob"`` or ``"nno"``)
    :param str city: City of publication.
    :param str ddk: `Dewey Decimal Classification
        <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_ identifier.
    :param str topic: Topic of the documents.
    :return: a ``pandas.DataFrame`` with the resulting frequency counts of the word(s),
        spread across years. One year per row.
    """
    params = locals()
    if isinstance(word, str):
        # assume a comma-separated string
        word = [w.strip() for w in word.split(",")]
    params["word"] = tuple(word)
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/ngram_periodicals", json=params)
    # print(r.status_code)
    df = pd.DataFrame.from_dict(r.json(), orient="index")
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    df = pd.concat([df.loc[x] for x in columns], axis=1)
    df.columns = columns
    # df.index = df.index.map(pd.Timestamp)
    return df

def ngram_news(
    word: Union[List, str] = ["."],
    title: str = None,
    period: Tuple[int, int] = None,
) -> DataFrame:
    """Get a time series of frequency counts for ``word`` in newspapers.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/ngram_newspapers`.

    :param word: Word(s) to search for.
        Can be several words in a single string, separated by comma, e.g. ``"ord,ordene,orda"``.
    :type word: str or list of str
    :param str title: Title of a specific newspaper to search through.
    :param period: Start and end years or dates of a time period,
        given as ``(YYYY, YYYY)`` or ``(YYYYMMDD, YYYYMMDD)``.
    :type period: tuple of ints
    :return: a ``pandas.DataFrame`` with the resulting frequency counts of the word(s),
        spread across the dates given in the time period. Either one year or one day per row.
    """
    params = locals()
    if isinstance(word, str):
        # assume a comma-separated string
        word = [w.strip() for w in word.split(",")]
    params["word"] = tuple(word)
    params = {x: params[x] for x in params if params[x] is not None}
    r = requests.post(BASE_URL + "/ngram_newspapers", json=params)
    # print(r.status_code)
    df = pd.DataFrame.from_dict(r.json(), orient="index")
    df.index = df.index.map(lambda x: tuple(x.split()))
    columns = df.index.levels[0]
    df = pd.concat([df.loc[x] for x in columns], axis=1)
    df.columns = columns
    # df.index = df.index.map(pd.Timestamp)
    return df

def get_document_frequencies(
    urns: List[str] = None, cutoff: int = 0, words: List[str] = None
) -> DataFrame:
    """Fetch frequency counts of ``words`` in documents (``urns``).

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/frequencies`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param int cutoff: minimum frequency of a word to be counted
    :param list words: a list of words to be counted - if left None, the whole document
        is returned. If not None, both the counts and their relative frequencies are returned.
    """
    params = locals()
    r = requests.post(f"{BASE_URL}/frequencies", json=params)
    result = r.json()
    # check if words are passed - the return structure differs a bit
    if words is None:
        structure = dict()
        for u in result:
            try:
                structure[u[0][0]] = dict([(x[1], x[2]) for x in u])
            except IndexError:
                pass
        df = pd.DataFrame(structure)
        df = df.sort_values(by=df.columns[0], ascending=False).fillna(0)
    else:
        df = pd.DataFrame(result)
        df.columns = ["urn", "word", "freq", "urncount"]
        df["relfreq"] = df["freq"] / df.urncount
        df = pd.pivot_table(
            df, values=["freq", "relfreq"], index="word", columns="urn"
        ).fillna(0)
    return df

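
# Illustrative usage sketch (not part of the library): per-document counts and
# relative frequencies for two target words in the example books. With ``words``
# given, the result is a word-by-urn pivot table with "freq" and "relfreq" column
# groups, as constructed above. Requires network access.
def _example_document_frequencies() -> DataFrame:
    urns = [
        "URN:NBN:no-nb_digibok_2008051404065",
        "URN:NBN:no-nb_digibok_2010092120011",
    ]
    return get_document_frequencies(urns=urns, words=["og", "i"])
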
def get_word_frequencies(
    urns: List[str] = None, cutoff: int = 0, words: List[str] = None
) -> DataFrame:
    """Fetch frequency numbers for ``words`` in documents (``urns``).

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/frequencies`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param int cutoff: minimum frequency of a word to be counted
    :param list words: a list of words to be counted - should not be left None.
    """
    return get_document_frequencies(urns, cutoff, words)

def get_urn_frequencies(urns: List[str] = None, dhlabid: List = None) -> DataFrame:
    """Fetch frequency counts for documents given as URNs or DH-lab ids.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/urn_frequencies`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param list dhlabid: list of dhlabid numbers, for example: ``[1000001, 2000003]``
    """
    if dhlabid is None:
        params = {"urns": urns}
    else:
        params = {"dhlabid": dhlabid}
    r = requests.post(f"{BASE_URL}/urn_frequencies", json=params)
    result = r.json()
    df = pd.DataFrame(result)
    df.columns = ["urn", "freq"]
    return df

def get_document_corpus(**kwargs):
    """Wrapper for :func:`document_corpus`."""
    return document_corpus(**kwargs)

def document_corpus(
    doctype: str = None,
    author: str = None,
    freetext: str = None,
    fulltext: str = None,
    from_year: int = None,
    to_year: int = None,
    from_timestamp: int = None,
    to_timestamp: int = None,
    title: str = None,
    ddk: str = None,
    subject: str = None,
    lang: str = None,
    limit: int = None,
    order_by: str = None,
) -> DataFrame:
    """Fetch a corpus based on metadata.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/build_corpus <https://api.nb.no/dhlab/#/default/post_build_corpus>`_.

    :param str doctype: ``"digibok"``, ``"digavis"``, ``"digitidsskrift"`` or ``"digistorting"``
    :param str author: Name of an author.
    :param str freetext: any of the parameters, for example: ``"digibok AND Ibsen"``.
    :param str fulltext: words within the publication.
    :param int from_year: Start year for time period of interest.
    :param int to_year: End year for time period of interest.
    :param int from_timestamp: Start date for time period of interest.
        Format: ``YYYYMMDD``, books have ``YYYY0101``
    :param int to_timestamp: End date for time period of interest.
        Format: ``YYYYMMDD``, books have ``YYYY0101``
    :param str title: Name or title of a document.
    :param str ddk: `Dewey Decimal Classification
        <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_ identifier.
    :param str subject: subject (keywords) of the publication.
    :param str lang: Language of the publication, as a 3-letter ISO code.
        Example: ``"nob"`` or ``"nno"``
    :param int limit: number of items to sample.
    :param str order_by: order of elements in the corpus object. Typically used in combination
        with a limit. Example: ``"random"`` (random order, the slowest),
        ``"rank"`` (ordered by relevance, faster) or
        ``"first"`` (breadth-first, using the order in the database table, the fastest method).
    :return: a ``pandas.DataFrame`` with the corpus information.
    """
    parms = locals()
    params = {x: parms[x] for x in parms if parms[x] is not None}
    r = requests.post(BASE_URL + "/build_corpus", json=params)
    return pd.DataFrame(r.json())

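
# Illustrative usage sketch (not part of the library): sample a small random corpus
# of books mentioning "hvalfangst" published 1900-1950. Requires network access.
def _example_document_corpus() -> DataFrame:
    return document_corpus(
        doctype="digibok",
        fulltext="hvalfangst",
        from_year=1900,
        to_year=1950,
        limit=10,
        order_by="random",
    )
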
def urn_collocation(
    urns: List = None,
    word: str = "arbeid",
    before: int = 5,
    after: int = 0,
    samplesize: int = 200000,
) -> DataFrame:
    """Create a collocation from a list of URNs.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint `/urncolldist_urn`.

    :param list urns: list of uniform resource name strings, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param str word: word to construct collocation with.
    :param int before: number of words preceding the given ``word``.
    :param int after: number of words following the given ``word``.
    :param int samplesize: total number of ``urns`` to search through.
    :return: a ``pandas.DataFrame`` with distance (sum of distances and bayesian distance)
        and frequency for words collocated with ``word``.
    """
    params = {
        "urn": urns,
        "word": word,
        "before": before,
        "after": after,
        "samplesize": samplesize,
    }
    r = requests.post(BASE_URL + "/urncolldist_urn", json=params)
    return pd.read_json(StringIO(r.json()))

def totals(top_words: int = 50000) -> DataFrame:
    """Get aggregated raw frequencies of all words in the National Library's database.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/totals/{top_words} <https://api.nb.no/dhlab/#/default/get_totals__top_words_>`_.

    :param int top_words: The number of words to get total frequencies for.
    :return: a ``pandas.DataFrame`` with the most frequent words.
    """
    r = requests.get(BASE_URL + f"/totals/{top_words}")
    return pd.DataFrame.from_dict(dict(r.json()), orient="index", columns=["freq"])

def concordance(
    urns: list = None, words: str = None, window: int = 25, limit: int = 100
) -> DataFrame:
    """Get a list of concordances from the National Library's database.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/conc <https://api.nb.no/dhlab/#/default/post_conc>`_.

    :param list urns: uniform resource names, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param str words: Word(s) to search for.
        Can be an SQLite fulltext query, an fts5 string search expression.
    :param int window: number of tokens on either side to show in the concordances, between 1-25.
    :param int limit: max. number of concordances per document. Maximum value is 1000.
    :return: a table of concordances
    """
    if words is None:
        return {}  # exit condition
    params = {"urns": urns, "query": words, "window": window, "limit": limit}
    r = requests.post(BASE_URL + "/conc", json=params)
    return pd.DataFrame(r.json())

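
# Illustrative usage sketch (not part of the library): build a corpus with
# document_corpus and pass its URNs to concordance. The "urn" column name is an
# assumption about the corpus DataFrame returned by the API; adjust if it differs.
# Requires network access.
def _example_concordance() -> DataFrame:
    corpus = document_corpus(doctype="digibok", fulltext="hvalfangst", limit=5)
    return concordance(
        urns=list(corpus["urn"]), words="hvalfangst", window=20, limit=50
    )
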
def concordance_counts(
    urns: list = None, words: str = None, window: int = 25, limit: int = 100
) -> DataFrame:
    """Count concordances (keyword in context) for a corpus query (used for collocation analysis).

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/conccount <https://api.nb.no/dhlab/#/default/post_conccount>`_.

    :param list urns: uniform resource names, for example:
        ``["URN:NBN:no-nb_digibok_2008051404065", "URN:NBN:no-nb_digibok_2010092120011"]``
    :param str words: Word(s) to search for.
        Can be an SQLite fulltext query, an fts5 string search expression.
    :param int window: number of tokens on either side to show in the concordances, between 1-25.
    :param int limit: max. number of concordances per document. Maximum value is 1000.
    :return: a table of counts
    """
    if words is None:
        return {}  # exit condition
    params = {"urns": urns, "query": words, "window": window, "limit": limit}
    r = requests.post(BASE_URL + "/conccount", json=params)
    return pd.DataFrame(r.json())

def konkordans(urns: list = None, words: str = None, window: int = 25, limit: int = 100):
    """Wrapper for :func:`concordance`."""
    return concordance(**locals())

def word_concordance(
    urn: list = None,
    dhlabid: list = None,
    words: list = None,
    before: int = 12,
    after: int = 12,
    limit: int = 100,
    samplesize: int = 50000,
) -> DataFrame:
    """Get a list of concordances from the National Library's database.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint
    `/conc <https://api.nb.no/dhlab/#/default/conc_word_urn>`_.

    :param list urn: list of URNs. (The server accepts both urns and dhlabids,
        so this may be rewritten.)
    :param list dhlabid: list of dhlab serial ids.
    :param list words: word(s) to search for -- must be a list.
    :param int before: between 0-24.
    :param int after: between 0-24 (before + after <= 24)
    :param int limit: max. number of concordances per server process.
    :param int samplesize: samples from urns.
    :return: a table of concordances
    """
    # The server checks whether dhlabid or urns are present in the parameters,
    # so only one of them is passed. The return contains dhlabids either way.
    if dhlabid is not None:
        params = {
            "dhlabid": dhlabid,
            "words": words,
            "before": before,
            "after": after,
            "limit": limit,
            "samplesize": samplesize,
        }
    elif urn is not None:
        params = {
            "urn": urn,
            "words": words,
            "before": before,
            "after": after,
            "limit": limit,
            "samplesize": samplesize,
        }
    else:
        params = {
            "words": words,
            "before": before,
            "after": after,
            "limit": limit,
            "samplesize": samplesize,
        }
    r = requests.post(BASE_URL + "/conc_word_urn", json=params)
    return pd.DataFrame(
        [x for y in r.json() for x in y],
        columns=["dhlabid", "before", "target", "after"],
    )

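
# Illustrative usage sketch (not part of the library): token-aligned concordances
# (dhlabid/before/target/after columns) for one word in the example book from the
# docstrings. Requires network access.
def _example_word_concordance() -> DataFrame:
    return word_concordance(
        urn=["URN:NBN:no-nb_digibok_2011051112001"],
        words=["arbeid"],
        before=10,
        after=10,
        limit=20,
    )
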
def collocation(
    corpusquery: str = "norge", word: str = "arbeid", before: int = 5, after: int = 0
) -> DataFrame:
    """Make a collocation from a corpus query.

    :param str corpusquery: query string
    :param str word: target word for the collocations.
    :param int before: number of words prior to ``word``
    :param int after: number of words following ``word``
    :return: a dataframe with the resulting collocations
    """
    params = {
        "metadata_query": corpusquery,
        "word": word,
        "before": before,
        "after": after,
    }
    r = requests.post(BASE_URL + "/urncolldist", json=params)
    return pd.read_json(r.json())

# Norwegian word bank
def word_variant(word: str, form: str, lang: str = "nob") -> list:
    """Find alternative ``form`` for a given ``word`` form.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/variant_form``.

    Example: ``word_variant('spiste', 'pres-part')``

    :param str word: any word string
    :param str form: a morphological feature tag from the Norwegian word bank
        `"Ordbanken" <https://www.nb.no/sprakbanken/ressurskatalog/oai-nb-no-sbr-5/>`_.
    :param str lang: either "nob" or "nno"
    """
    r = requests.get(
        f"{BASE_URL}/variant_form", params={"word": word, "form": form, "lang": lang}
    )
    return r.json()

def word_paradigm(word: str, lang: str = "nob") -> list:
    """Find paradigms for a given ``word`` form.

    Call the API :py:obj:`~dhlab.constants.BASE_URL` endpoint ``/paradigm``.

    Example:

    .. code-block:: python

        word_paradigm('spiste')
        # [['adj', ['spisende', 'spist', 'spiste']],
        #  ['verb', ['spis', 'spise', 'spiser', 'spises', 'spist', 'spiste']]]

    :param str word: any word string
    :param str lang: either "nob" or "nno"
    """
    r = requests.get(f"{BASE_URL}/paradigm", params={"word": word, "lang": lang})
    return r.json()

def word_paradigm_many(wordlist: list, lang: str = "nob") -> list:
    """Find alternative forms for a list of words."""
    r = requests.post(f"{BASE_URL}/paradigms", json={"words": wordlist, "lang": lang})
    return r.json()

def word_form(word: str, lang: str = "nob") -> list:
    """Look up the morphological feature specification of a ``word`` form."""
    r = requests.get(f"{BASE_URL}/word_form", params={"word": word, "lang": lang})
    return r.json()

def word_form_many(wordlist: list, lang: str = "nob") -> list:
    """Look up the morphological feature specifications for word forms in a ``wordlist``."""
    r = requests.post(f"{BASE_URL}/word_forms", json={"words": wordlist, "lang": lang})
    return r.json()

def word_lemma(word: str, lang: str = "nob") -> list:
    """Find the list of possible lemmas for a given ``word`` form."""
    r = requests.get(f"{BASE_URL}/word_lemma", params={"word": word, "lang": lang})
    return r.json()

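
# Illustrative usage sketch (not part of the library): combine the word bank lookups
# above for a single word form, using the docstring example "spiste".
# Requires network access.
def _example_wordbank_lookup() -> dict:
    return {
        "paradigm": word_paradigm("spiste"),
        "form": word_form("spiste"),
        "lemma": word_lemma("spiste"),
    }
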
def word_lemma_many(wordlist, lang="nob"):
    """Find lemmas for a list of given word forms."""
    # Body missing in the source listing; reconstructed by analogy with
    # word_form_many/word_lemma above. The ``/word_lemmas`` endpoint name is an
    # assumption following that pattern.
    r = requests.post(f"{BASE_URL}/word_lemmas", json={"words": wordlist, "lang": lang})
    return r.json()

def query_imagination_corpus(
    category=None,
    author=None,
    title=None,
    year=None,
    publisher=None,
    place=None,
    oversatt=None,
):
    """Fetch data from the imagination corpus."""
    params = locals()
    params = {key: params[key] for key in params if params[key] is not None}
    # print(params)
    r = requests.get(f"{BASE_URL}/imagination", params=params)
    return r.json()