"""Source code for dhlab.text.geo_data."""

import pandas as pd

from dhlab.api.dhlab_api import geo_lookup, get_places, ner_from_urn
from dhlab.text.parse import NER, Models


class GeoData:
    """Fetch place data from a text (book, newspaper or ...) identified by URN
    with an appropriate and available spacy model.

    The models are retrieved by instantiating :py:class:`~text.parse.Models`.
    """

    def __init__(self, urn=None, model=None):
        # Dataframe of NER rows whose tag contains "LOC"; empty on any failure.
        self.place_names = self._fetch_place_names(urn, model)

    def _fetch_place_names(self, urn, model):
        """Run NER on ``urn`` and keep rows tagged as locations.

        Prints a diagnostic and returns an empty dataframe when ``urn`` is
        missing, no spaCy model can be loaded, or the NER call fails.
        """
        # Explicit check instead of ``assert``: asserts are stripped under
        # ``python -O``, which would skip the friendly message entirely.
        if urn is None:
            print("Please provide a document URN to fill the ``place_names`` dataframe attribute.")
            return pd.DataFrame()
        try:
            spacy = Models()
            # Fall back to the first available model when none was given.
            model = model if model is not None else spacy.models[0]
            df = NER(urn=urn, model=model).ner
            # Keep any entity whose tag contains the substring "LOC".
            return df[df['ner'].str.contains('LOC')]
        except IndexError:
            # ``spacy.models`` was empty — nothing to fall back to.
            print("GeoData couldn't load any SpaCy NER models.")
        except Exception as error:
            # Best effort: report the problem rather than crash the caller.
            print(error.__doc__, error)
        return pd.DataFrame()

    def add_geo_locations(self, feature_class=None, feature_code=None):
        """Get location data for the names in object, attribute self.place_names"""
        chunksize = 900
        names = list(self.place_names.token)
        if not names:
            # Nothing to look up — ``pd.concat([])`` would raise ValueError.
            self.places = pd.DataFrame()
            return
        # GeoNames takes 1000 names at a time so chunk things up.
        # Each GeoNames object has an attribute .places, a pandas dataframe.
        chunks = [
            GeoNames(
                names[i:i + chunksize],
                feature_class=feature_class,
                feature_code=feature_code,
            ).places
            for i in range(0, len(names), chunksize)
        ]
        self.places = pd.concat(chunks)
class GeoNames:
    """Fetch data from a list of names"""

    def __init__(self, names, feature_class=None, feature_code=None):
        # ``places`` holds the GeoNames lookup result as a pandas dataframe.
        lookup_result = geo_lookup(
            names,
            feature_class=feature_class,
            feature_code=feature_code,
        )
        self.places = lookup_result