Source code for dhlab.api.nb_search_api

from typing import Iterable

import pandas as pd
import requests


def load_picture(url: str):
    """Open a streaming GET request to ``url`` and return its raw body.

    The request is made with ``stream=True`` and the ``decode_content``
    flag of the underlying raw stream is set to ``True`` before the
    stream is handed back.

    :param url: address of the image to fetch.
    :return: the ``raw`` attribute of the ``requests`` response.
    """
    response = requests.get(url, stream=True)
    raw_stream = response.raw
    raw_stream.decode_content = True
    return raw_stream
def iiif_manifest(urn: str):
    """Fetch the IIIF manifest of the item identified by ``urn``.

    :param urn: identifier inserted into the manifest URL.
    :return: the JSON-decoded body of the response.
    """
    manifest_url = f"https://api.nb.no/catalog/v1/iiif/{urn}/manifest"
    response = requests.get(manifest_url)
    return response.json()
def mods(urn: str):
    """Fetch the MODS metadata record of the item identified by ``urn``.

    :param urn: identifier inserted into the metadata URL.
    :return: the JSON-decoded body of the response.
    """
    # NOTE(review): MODS is commonly served as XML; confirm that this
    # endpoint really answers with JSON, otherwise ``.json()`` will raise.
    metadata_url = f"https://api.nb.no:443/catalog/v1/metadata/{urn}/mods"
    response = requests.get(metadata_url)
    return response.json()
def get_df(phrases: Iterable, title: str = 'aftenposten'):
    """Count hits per year for ``phrases`` in documents named ``title``.

    Every phrase is wrapped in double quotes and the quoted phrases are
    joined with ``" + "`` to form the search expression. The search is
    restricted with a ``title:<title>`` filter and asks for a ``year``
    aggregation.

    :param phrases: iterable of phrase strings.
    :param title: value used in the ``title:`` filter.
    :return: dict mapping each bucket ``key`` of the first aggregation
        to its ``count``.
    """
    quoted_phrases = ('"' + phrase + '"' for phrase in phrases)
    params = {
        'q': " + ".join(quoted_phrases),
        'size': 1,
        'aggs': 'year',
        'filter': f'title:{title}',
    }
    response = requests.get("https://api.nb.no/catalog/v1/items", params=params)
    payload = response.json()
    buckets = payload['_embedded']['aggregations'][0]['buckets']

    counts = {}
    for bucket in buckets:
        counts[bucket['key']] = bucket['count']
    return counts
def get_json(phrases, mediatype='aviser'):
    """Run a full-text search for ``phrases`` and return the whole reply.

    Every phrase is wrapped in double quotes and the quoted phrases are
    joined with ``" + "``. The request asks for a ``year`` aggregation
    and uses ``searchType=FULL_TEXT_SEARCH``.

    :param phrases: iterable of phrase strings.
    :param mediatype: value sent as the ``snippets`` request parameter.
        NOTE(review): no ``mediatype:`` filter is applied here; confirm
        that ``snippets`` is the intended destination for this value.
    :return: the JSON-decoded body of the response.
    """
    quoted_phrases = ('"' + phrase + '"' for phrase in phrases)
    params = {
        'q': " + ".join(quoted_phrases),
        'size': 1,
        'snippets': mediatype,
        'aggs': 'year',
        'searchType': 'FULL_TEXT_SEARCH',
    }
    response = requests.get("https://api.nb.no/catalog/v1/items", params=params)
    return response.json()
def get_data(frase, media='avis', title='jazznytt'):
    """Search for the exact phrase ``frase`` in documents named ``title``.

    The phrase is wrapped in one pair of double quotes, the search is
    restricted with a ``title:<title>`` filter, and a ``year``
    aggregation is requested.

    :param frase: phrase string to search for.
    :param media: accepted for backward compatibility but not applied.
        The request has only ever carried the ``title:`` filter, because
        a ``mediatype:`` entry written under the same ``'filter'`` dict
        key was overwritten before the request was sent.
        NOTE(review): decide whether a ``mediatype:`` filter should be
        sent as well, and confirm that the default ``'avis'`` is a valid
        mediatype value before enabling it.
    :param title: value used in the ``title:`` filter.
    :return: the JSON-decoded body of the response.
    """
    query = {
        # Exactly one opening and one closing quote around the phrase.
        'q': '"' + frase + '"',
        'size': 1,
        'aggs': 'year',
        # Single 'filter' key: a dict cannot hold two entries with the
        # same key, so only the title filter is (and was) transmitted.
        'filter': f'title:{title}',
    }
    r = requests.get("https://api.nb.no/catalog/v1/items", params=query)
    return r.json()
def get_data_and(frases, title='aftenposten', media='avis'):
    """Search for all of ``frases`` together in documents named ``title``.

    Every phrase is wrapped in double quotes and the quoted phrases are
    joined with ``" + "``. The resulting search expression is printed to
    standard output before the request is sent.

    :param frases: iterable of phrase strings.
    :param title: value used in the ``title:`` filter.
    :param media: not used by this function.
    :return: the JSON-decoded body of the response.
    """
    quoted_phrases = ('"' + phrase + '"' for phrase in frases)
    search_expression = " + ".join(quoted_phrases)
    print(search_expression)

    params = {
        'q': search_expression,
        'size': 1,
        'aggs': 'year',
        'filter': f'title:{title}',
    }
    response = requests.get("https://api.nb.no/catalog/v1/items", params=params)
    return response.json()
def get_df_pd(frase, media='bøker'):
    """Return per-year hit counts for the phrases in ``frase`` as a DataFrame.

    Every phrase is wrapped in double quotes and the quoted phrases are
    joined with ``" + "``. The search is restricted with a
    ``mediatype:<media>`` filter and asks for a ``year`` aggregation.

    The request is issued here rather than through ``get_df`` because
    ``get_df`` takes a ``title`` argument and has no ``media`` parameter,
    so forwarding ``media=`` to it raises ``TypeError``.

    :param frase: iterable of phrase strings.
    :param media: value used in the ``mediatype:`` filter.
    :return: ``pandas.DataFrame`` indexed by the aggregation bucket keys,
        sorted by index, holding the bucket counts.
    :raises KeyError, IndexError: if the response carries no aggregation
        buckets in the expected place.
    """
    querystring = " + ".join('"' + phrase + '"' for phrase in frase)
    query = {
        'q': querystring,
        'size': 1,
        'aggs': 'year',
        'filter': f'mediatype:{media}',
    }
    r = requests.get("https://api.nb.no/catalog/v1/items", params=query)
    buckets = r.json()['_embedded']['aggregations'][0]['buckets']
    counts = {bucket['key']: bucket['count'] for bucket in buckets}
    return pd.DataFrame.from_dict(counts, orient='index').sort_index()
def get_konks(urn, phrase, window=1000, n=1000):
    """Fetch concordance-style fragments for ``phrase`` inside item ``urn``.

    The phrase is sent wrapped in double quotes. Each returned fragment
    text is split around its first ``<em>...</em>`` pair into the text
    before the marked span, the marked span itself, and the text that
    follows it (up to the next ``<em>``, if any).

    :param urn: identifier inserted into the contentfragments URL.
    :param phrase: phrase string to search for.
    :param window: sent as the ``fragSize`` request parameter.
    :param n: sent as the ``fragments`` request parameter.
    :return: list of dicts with keys ``urn`` (the fragment's ``pageid``),
        ``before``, ``word`` and ``after``. If the response does not have
        the expected shape, or any fragment lacks an ``<em>...</em>``
        pair, an empty list is returned.
    """
    query = {
        'q': '"' + phrase + '"',
        'fragments': n,
        'fragSize': window,
    }
    r = requests.get(
        f"https://api.nb.no/catalog/v1/items/{urn}/contentfragments",
        params=query)
    res = r.json()

    results = []
    try:
        for fragment in res['contentFragments']:
            page_id = fragment['pageid']
            before, marked = fragment['text'].split('<em>')[:2]
            word, after = marked.split('</em>')[:2]
            results.append(
                {'urn': page_id, 'before': before, 'word': word, 'after': after})
    except (KeyError, IndexError, TypeError, ValueError, AttributeError):
        # Best effort: an unexpected response shape or a fragment without
        # highlight markup yields no results at all. Only parse-related
        # errors are caught so KeyboardInterrupt/SystemExit still propagate.
        return []
    return results
def get_phrase_info(urn, phrase, window=1000, n=1000):
    """Return the content-fragment reply for ``phrase`` inside item ``urn``.

    The phrase is sent wrapped in double quotes as the only request
    parameter.

    :param urn: identifier inserted into the contentfragments URL.
    :param phrase: phrase string to search for.
    :param window: not used by this function.
    :param n: not used by this function.
    :return: the JSON-decoded body of the response.
    """
    params = {'q': '"' + phrase + '"'}
    fragments_url = f"https://api.nb.no/catalog/v1/items/{urn}/contentfragments"
    response = requests.get(fragments_url, params=params)
    return response.json()