Source code for dhlab.text.corpus

from typing import List, Union

import pandas as pd
from pandas import DataFrame

# import dhlab as dh
# from dhlab.text.conc_coll import Concordance, Collocations, Counts
import dhlab.text.conc_coll as dh
from dhlab.api.dhlab_api import document_corpus, evaluate_documents, get_metadata
from dhlab.text.dhlab_object import DhlabObj
from dhlab.text.utils import urnlist



[docs]
class Corpus(DhlabObj):
    """Class representing as DHLAB Corpus

    Primary object for working with dhlab data. Contains references to texts
    in National Library's collections and metadata about them.
    Use with `.coll`, `.conc` or `.freq` to analyse using dhlab tools.
    """

    doctypes = [
        "digibok",
        "digavis",
        "digitidsskrift",
        "digistorting",
        "digimanus",
        "kudos",
    ]

    def __init__(
        self,
        doctype=None,
        author=None,
        freetext=None,
        fulltext=None,
        from_year=None,
        to_year=None,
        from_timestamp=None,
        to_timestamp=None,
        title=None,
        ddk=None,
        subject=None,
        lang=None,
        limit=10,
        order_by="random",
    ):
        """Create Corpus

        :param str doctype: ``"digibok"``, ``"digavis"``, \
            ``"digitidsskrift"`` or ``"digistorting"``
        :param str author: Name of an author.
        :param str freetext: any of the parameters, for example:\
            ``"digibok AND Ibsen"``.
        :param str fulltext: words within the publication.
        :param int from_year: Start year for time period of interest.
        :param int to_year: End year for time period of interest.
        :param int from_timestamp: Start date for time period of interest.
            Format: ``YYYYMMDD``, books have ``YYYY0101``
        :param int to_timestamp: End date for time period of interest.
            Format: ``YYYYMMDD``, books have ``YYYY0101``
        :param str title: Name or title of a document.
        :param str ddk: `Dewey Decimal Classification \
            <https://no.wikipedia.org/wiki/Deweys_desimalklassifikasjon>`\
                _ identifier.
        :param str subject: subject (keywords) of the publication.
        :param str lang: Language of the publication, as a 3-letter ISO code.
            Example: ``"nob"`` or ``"nno"``
        :param int limit: number of items to sample.
        """

        if (
            doctype
            or author
            or freetext
            or fulltext
            or from_year
            or to_year
            or from_timestamp
            or to_timestamp
            or title
            or ddk
            or lang
        ):
            self.corpus = document_corpus(
                doctype,
                author,
                freetext,
                fulltext,
                from_year,
                to_year,
                from_timestamp,
                to_timestamp,
                title,
                ddk,
                subject,
                lang,
                limit,
                order_by,
            )

        else:
            self.corpus = pd.DataFrame(columns=["urn"])

        super().__init__(self.corpus)
        # self.size = len(self.corpus)

        self.frame.rename(
            columns={
                "urn": "urn",
                "authors": "author",
                "langs": "language",
                "genres": "genre",
            },
            inplace=True,
        )


[docs]
    @classmethod
    def from_identifiers(cls, identifiers: List[Union[str, int]]):
        """Construct Corpus from list of identifiers"""
        corpus = Corpus()
        corpus.extend_from_identifiers(identifiers=identifiers)
        return corpus



[docs]
    @classmethod
    def from_df(cls, df: DataFrame, check_for_urn: bool = False):
        """Typecast Pandas DataFrame to Corpus class

        DataFrame most contain URN column"""

        # If Series, return as is
        if isinstance(df, pd.Series):
            return df

        df = df.copy()  # Avoid modifying original DataFrame

        corpus = Corpus()
        if check_for_urn:
            corpus.corpus = cls._urn_id_in_dataframe_cols(df)
        else:
            corpus.corpus = df
        corpus.frame = corpus.corpus
        return corpus



[docs]
    @classmethod
    def from_csv(cls, path: str):
        """Import corpus from csv"""
        df = pd.read_csv(path)
        return cls.from_df(df)


    @staticmethod
    def _urn_id_in_dataframe_cols(dataframe: Union[DataFrame, type("Corpus")]) -> DataFrame:
        """Checks if dataframe contains URN column"""
        if "urn" in dataframe.columns:
            if dataframe.urn.str.contains("^URN:NBN:no-nb_.+").all():
                return dataframe
        raise ValueError("No'urn'-column in dataframe.")


[docs]
    def extend_from_identifiers(self, identifiers: list = None):
        new_corpus = get_metadata(urnlist(identifiers))
        self.add(new_corpus)



[docs]
    def evaluate_words(self, wordbags=None):
        df = evaluate_documents(wordbags=wordbags, urns=list(self.corpus.urn))
        df.index = df.index.astype(int)
        cols = df.columns
        df = pd.concat(
            [df, self.corpus[["dhlabid", "urn"]].set_index("dhlabid")], axis=1
        )
        df = df.set_index("urn")
        return df[cols].fillna(0)



[docs]
    def add(self, new_corpus: Union[DataFrame, type("Corpus")]):
        """Utility for appending Corpus or DataFrame to self"""
        if self._is_Corpus(new_corpus):
            new_corpus = new_corpus.frame
        self.frame = (
            pd.concat([self.frame, new_corpus]).drop_duplicates().reset_index(drop=True)
        )
        self.corpus = self.frame

        # self.size = len(self.frame)


[docs]
    def sample(self, n: int = 5):
        """Create random subkorpus with `n` entries"""
        n = min(n, self.size)
        sample = self.corpus.sample(n).copy()
        return self.from_df(sample)



[docs]
    def only_one_author(self):
        """Only select items with one author"""
        mask = self.frame.author.apply(lambda x: len(x.split('/'))) == 1
        return self.from_df(self.frame[mask])



[docs]
    def only_one_language(self):
        """Only select items with one language"""
        mask = self.frame.language.apply(lambda x: len(x.split('/'))) == 1
        return self.from_df(self.frame[mask])



[docs]
    def conc(self, words, window: int = 20, limit: int = 500) -> dh.Concordance:
        """Get concodances of `words` in corpus"""
        return dh.Concordance(
            corpus=self.frame, query=words, window=window, limit=limit
        )



[docs]
    def coll(
        self,
        words=None,
        before=10,
        after=10,
        reference=None,
        samplesize=20000,
        alpha=False,
        ignore_caps=False,
    ) -> dh.Collocations:
        """Get collocations of `words` in corpus"""
        return dh.Collocations(
            corpus=self.frame,
            words=words,
            before=before,
            after=after,
            reference=reference,
            samplesize=samplesize,
            alpha=alpha,
            ignore_caps=ignore_caps,
        )



[docs]
    def count(self, words=None):
        """Get word frequencies for corpus"""
        return dh.Counts(self, words)



[docs]
    def freq(self, words=None):
        """Get word frequencies for corpus"""
        return dh.Counts(self, words)


    @staticmethod
    def _is_Corpus(corpus: "Corpus") -> bool:
        """Check if `input` is Corpus or DataFrame"""
        if type(corpus) not in [DataFrame, Corpus]:
            raise TypeError("Input is not Corpus or DataFrame")
        return isinstance(corpus, Corpus)

    def __add__(self, other):
        """Add two Corpus objects"""
        if not self._is_Corpus(other):
            raise TypeError("Input is not Corpus or DataFrame")
        return self.from_df(
            pd.concat([self.frame, other.frame])
            .drop_duplicates()
            .reset_index(drop=True)
        )

    def _make_subcorpus(self, **kwargs) -> "Corpus":
        dct = kwargs.copy()
        year_range = dct.pop("year_range", None)

        for key in dct.keys():
            if key not in self.frame.columns:
                print(f"Key {key} not in corpus")
                return None

        # Make result dataframe
        res = self.frame.copy()

        # Get year range
        if year_range is not None:
            y1 = int(year_range[0])
            y2 = int(year_range[1])

            # Apply year range
            res = res.loc[res["year"] >= y1].loc[res["year"] <= y2]

        for key, val in dct.items():
            res = res.loc[res[key].str.contains(val)]

        return self.from_df(res)


[docs]
    def make_subcorpus(self, authors: str = None, title: str = None) -> "Corpus":
        """Make subcorpus based on author and title

        Args:
            authors (str, optional): search for author field. Defaults to None.
            title (str, optional): search title field. Defaults to None.

        Returns:
            Corpus: A subset of the original corpus
        """
        dct = {}
        if authors is not None:
            dct["author"] = authors
        if title is not None:
            dct["title"] = title

        return self._make_subcorpus(**dct)



[docs]
    def check_integrity(self):
        """Check the integrity of the corpus data."""

        def test_dhlabid_series(series: pd.Series) -> bool:
            """Check if dhlabid series is valid"""
            if not series.apply(lambda x: isinstance(x, int)).all():
                return False
            if not ((series >= 1e8) & (series < 1e9)).all():
                return False

            return True

        def test_urn_series(series: pd.Series) -> bool:
            """Check if URN series is valid"""
            if series.str.startswith("URN:NBN:no-nb_").all():
                return True
            try:
                series = series.apply(lambda x: int(x))
                return test_dhlabid_series(series)
            except:
                return False

        # Check if the DataFrame is empty
        if self.corpus.empty:
            raise ValueError("Corpus is empty.")

        # Check for the presence of essential columns
        required_columns = ["urn", "dhlabid", "author", "language", "genre"]
        for col in required_columns:
            if col not in self.corpus.columns:
                raise ValueError(f"Essential column '{col}' is missing.")

        # Validate dhlabid format
        if not test_dhlabid_series(self.corpus["dhlabid"]):
            raise ValueError("Some dhlabid values are in an incorrect format.")

        # Validate URN format
        if not test_urn_series(self.corpus["urn"]):
            raise ValueError("Some URN values are in an incorrect format.")

        return True