Source code for dhlab.text.corpus

from typing import List, Union

import pandas as pd
from pandas import DataFrame

# import dhlab as dh
# from dhlab.text.conc_coll import Concordance, Collocations, Counts
import dhlab.text.conc_coll as dh
from dhlab.api.dhlab_api import document_corpus, evaluate_documents, get_metadata
from dhlab.text.dhlab_object import DhlabObj
from dhlab.text.utils import urnlist


class Corpus(DhlabObj):
    """Class representing a DHLAB Corpus.

    Primary object for working with dhlab data. Contains references to texts
    in National Library's collections and metadata about them.
    Use with `.coll`, `.conc` or `.freq` to analyse using dhlab tools.
    """

    doctypes = [
        "digibok",
        "digavis",
        "digitidsskrift",
        "digistorting",
        "digimanus",
        "kudos",
    ]

    def __init__(
        self,
        doctype=None,
        author=None,
        freetext=None,
        fulltext=None,
        from_year=None,
        to_year=None,
        from_timestamp=None,
        to_timestamp=None,
        title=None,
        ddk=None,
        subject=None,
        lang=None,
        limit=10,
        order_by="random",
    ):
        """Create Corpus

        If no search criterion is given, an empty corpus with only a ``urn``
        column is created and the API is not queried.

        :param str doctype: ``"digibok"``, ``"digavis"``,
            ``"digitidsskrift"`` or ``"digistorting"``
        :param str author: Name of an author.
        :param str freetext: any of the parameters, for example:
            ``"digibok AND Ibsen"``.
        :param str fulltext: words within the publication.
        :param int from_year: Start year for time period of interest.
        :param int to_year: End year for time period of interest.
        :param int from_timestamp: Start date for time period of interest.
            Format: ``YYYYMMDD``, books have ``YYYY0101``
        :param int to_timestamp: End date for time period of interest.
            Format: ``YYYYMMDD``, books have ``YYYY0101``
        :param str title: Name or title of a document.
        :param str ddk: `Dewey Decimal Classification
            <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`_
            identifier.
        :param str subject: subject (keywords) of the publication.
        :param str lang: Language of the publication, as a 3-letter ISO code.
            Example: ``"nob"`` or ``"nno"``
        :param int limit: number of items to sample.
        :param str order_by: forwarded unchanged to ``document_corpus``;
            defaults to ``"random"``.
        """
        # `limit` and `order_by` only shape the result; they do not count as
        # search criteria. `subject` does, and is included here so that a
        # subject-only query is actually sent to the API.
        search_criteria = (
            doctype,
            author,
            freetext,
            fulltext,
            from_year,
            to_year,
            from_timestamp,
            to_timestamp,
            title,
            ddk,
            subject,
            lang,
        )
        if any(search_criteria):
            self.corpus = document_corpus(
                doctype,
                author,
                freetext,
                fulltext,
                from_year,
                to_year,
                from_timestamp,
                to_timestamp,
                title,
                ddk,
                subject,
                lang,
                limit,
                order_by,
            )
        else:
            self.corpus = pd.DataFrame(columns=["urn"])

        super().__init__(self.corpus)

        # Renamed in place on purpose. NOTE(review): presumably
        # DhlabObj.__init__ stores the frame as `self.frame` and it is the
        # same object as `self.corpus`; `check_integrity` expects the renamed
        # columns on `self.corpus` - confirm before changing this.
        self.frame.rename(
            columns={
                "authors": "author",
                "langs": "language",
                "genres": "genre",
            },
            inplace=True,
        )

    @classmethod
    def from_identifiers(cls, identifiers: List[Union[str, int]]):
        """Construct Corpus from list of identifiers"""
        corpus = cls()
        corpus.extend_from_identifiers(identifiers=identifiers)
        return corpus

    @classmethod
    def from_df(cls, df: DataFrame, check_for_urn: bool = False):
        """Typecast Pandas DataFrame to Corpus class

        DataFrame must contain a URN column.

        :param df: frame to wrap. A ``pandas.Series`` is returned as is.
        :param bool check_for_urn: if True, validate the ``urn`` column and
            raise ``ValueError`` when it is missing or malformed.
        :return: a new corpus holding a copy of ``df``.
        """
        # If Series, return as is
        if isinstance(df, pd.Series):
            return df

        frame = df.copy()  # Avoid modifying original DataFrame
        if check_for_urn:
            frame = cls._urn_id_in_dataframe_cols(frame)

        corpus = cls()
        corpus.corpus = frame
        corpus.frame = frame
        return corpus

    @classmethod
    def from_csv(cls, path: str):
        """Import corpus from csv"""
        return cls.from_df(pd.read_csv(path))

    @staticmethod
    def _urn_id_in_dataframe_cols(
        dataframe: Union[DataFrame, "Corpus"]
    ) -> DataFrame:
        """Check that `dataframe` has a ``urn`` column holding only URNs.

        :return: `dataframe` unchanged when the check passes.
        :raises ValueError: if the column is missing, or if any value does
            not match ``URN:NBN:no-nb_...`` (missing values count as invalid).
        """
        if "urn" not in dataframe.columns:
            raise ValueError("No'urn'-column in dataframe.")
        # na=False keeps the mask strictly boolean when values are missing.
        is_urn = dataframe.urn.str.contains(r"^URN:NBN:no-nb_.+", na=False)
        if not is_urn.all():
            raise ValueError("Column 'urn' contains values that are not valid URNs.")
        return dataframe

    def extend_from_identifiers(self, identifiers: list = None):
        """Fetch metadata for `identifiers` and append it to this corpus"""
        new_corpus = get_metadata(urnlist(identifiers))
        self.add(new_corpus)

    def evaluate_words(self, wordbags=None):
        """Evaluate `wordbags` against the documents of the corpus

        :return: the result of ``evaluate_documents`` re-indexed by ``urn``,
            with missing values replaced by 0.
        """
        df = evaluate_documents(wordbags=wordbags, urns=list(self.corpus.urn))
        df.index = df.index.astype(int)
        cols = df.columns
        # Align on dhlabid to attach each row's urn, then index by urn.
        df = pd.concat(
            [df, self.corpus[["dhlabid", "urn"]].set_index("dhlabid")], axis=1
        )
        df = df.set_index("urn")
        return df[cols].fillna(0)

    def add(self, new_corpus: Union[DataFrame, "Corpus"]):
        """Utility for appending Corpus or DataFrame to self

        :raises TypeError: if `new_corpus` is neither Corpus nor DataFrame.
        """
        if self._is_Corpus(new_corpus):
            new_corpus = new_corpus.frame
        self.frame = (
            pd.concat([self.frame, new_corpus]).drop_duplicates().reset_index(drop=True)
        )
        self.corpus = self.frame

    def sample(self, n: int = 5):
        """Create random subcorpus with `n` entries"""
        # Never ask pandas for more rows than the corpus holds.
        n = min(n, self.size)
        sample = self.corpus.sample(n).copy()
        return self.from_df(sample)

    @staticmethod
    def _single_valued_mask(series: pd.Series) -> pd.Series:
        """Boolean mask of entries that are a string without a ``/`` separator.

        Missing or non-string entries are never selected.
        """
        mask = series.apply(lambda value: isinstance(value, str) and "/" not in value)
        # An empty Series would otherwise come back with object dtype.
        return mask.astype(bool)

    def only_one_author(self):
        """Only select items with one author"""
        mask = self._single_valued_mask(self.frame.author)
        return self.from_df(self.frame[mask])

    def only_one_language(self):
        """Only select items with one language"""
        mask = self._single_valued_mask(self.frame.language)
        return self.from_df(self.frame[mask])

    def conc(self, words, window: int = 20, limit: int = 500) -> dh.Concordance:
        """Get concordances of `words` in corpus"""
        return dh.Concordance(
            corpus=self.frame, query=words, window=window, limit=limit
        )

    def coll(
        self,
        words=None,
        before=10,
        after=10,
        reference=None,
        samplesize=20000,
        alpha=False,
        ignore_caps=False,
    ) -> dh.Collocations:
        """Get collocations of `words` in corpus"""
        return dh.Collocations(
            corpus=self.frame,
            words=words,
            before=before,
            after=after,
            reference=reference,
            samplesize=samplesize,
            alpha=alpha,
            ignore_caps=ignore_caps,
        )

    def count(self, words=None):
        """Get word frequencies for corpus"""
        return dh.Counts(self, words)

    def freq(self, words=None):
        """Get word frequencies for corpus"""
        return dh.Counts(self, words)

    @staticmethod
    def _is_Corpus(corpus: "Corpus") -> bool:
        """Check if `corpus` is a Corpus (True) or a DataFrame (False)

        :raises TypeError: if `corpus` is neither.
        """
        if not isinstance(corpus, (DataFrame, Corpus)):
            raise TypeError("Input is not Corpus or DataFrame")
        return isinstance(corpus, Corpus)

    def __add__(self, other):
        """Combine this corpus with another Corpus or DataFrame

        :return: a new corpus with duplicate rows removed.
        :raises TypeError: if `other` is neither Corpus nor DataFrame.
        """
        other_frame = other.frame if self._is_Corpus(other) else other
        return self.from_df(
            pd.concat([self.frame, other_frame])
            .drop_duplicates()
            .reset_index(drop=True)
        )

    def _make_subcorpus(self, **kwargs) -> Union["Corpus", None]:
        """Filter the corpus on column patterns and an optional year range

        :param kwargs: ``column=pattern`` pairs matched with
            ``str.contains``, plus the optional key ``year_range`` holding a
            ``(first, last)`` pair, inclusive at both ends.
        :return: the matching subcorpus, or None (after printing a message)
            if a requested column does not exist.
        """
        filters = dict(kwargs)
        year_range = filters.pop("year_range", None)

        for key in filters:
            if key not in self.frame.columns:
                print(f"Key {key} not in corpus")
                return None

        res = self.frame.copy()

        if year_range is not None:
            first_year = int(year_range[0])
            last_year = int(year_range[1])
            res = res.loc[res["year"].between(first_year, last_year)]

        for key, val in filters.items():
            # na=False: rows with a missing value simply do not match.
            res = res.loc[res[key].str.contains(val, na=False)]

        return self.from_df(res)

    def make_subcorpus(self, authors: str = None, title: str = None) -> "Corpus":
        """Make subcorpus based on author and title

        Args:
            authors (str, optional): search for author field. Defaults to None.
            title (str, optional): search title field. Defaults to None.

        Returns:
            Corpus: A subset of the original corpus
        """
        dct = {}
        if authors is not None:
            dct["author"] = authors
        if title is not None:
            dct["title"] = title
        return self._make_subcorpus(**dct)

    def check_integrity(self):
        """Check the integrity of the corpus data.

        :return: True when every check passes.
        :raises ValueError: if the corpus is empty, an essential column is
            missing, or ``dhlabid`` / ``urn`` values are malformed.
        """

        def test_dhlabid_series(series: pd.Series) -> bool:
            """Check if dhlabid series is valid (9-digit integers)"""
            if not series.apply(lambda x: isinstance(x, int)).all():
                return False
            return bool(((series >= 1e8) & (series < 1e9)).all())

        def test_urn_series(series: pd.Series) -> bool:
            """Check if URN series is valid

            Valid means either every value is a ``URN:NBN:no-nb_`` string, or
            every value converts to an integer that is a valid dhlabid.
            """
            # Cast first so a numeric column reaches the fallback below
            # instead of failing on the `.str` accessor.
            if series.astype(str).str.startswith("URN:NBN:no-nb_").all():
                return True
            try:
                as_int = series.apply(int)
            except (TypeError, ValueError, OverflowError):
                return False
            return test_dhlabid_series(as_int)

        # Check if the DataFrame is empty
        if self.corpus.empty:
            raise ValueError("Corpus is empty.")

        # Check for the presence of essential columns
        required_columns = ["urn", "dhlabid", "author", "language", "genre"]
        for col in required_columns:
            if col not in self.corpus.columns:
                raise ValueError(f"Essential column '{col}' is missing.")

        # Validate dhlabid format
        if not test_dhlabid_series(self.corpus["dhlabid"]):
            raise ValueError("Some dhlabid values are in an incorrect format.")

        # Validate URN format
        if not test_urn_series(self.corpus["urn"]):
            raise ValueError("Some URN values are in an incorrect format.")

        return True