Skip to content

Commit

Permalink
Merge pull request #926 from PrimozGodec/lagdetect-nyt
Browse files Browse the repository at this point in the history
[ENH] NYTimes - add language to corpus
  • Loading branch information
VesnaT authored Apr 7, 2023
2 parents ac3aeb9 + 3327bb4 commit 07b0cbf
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 48 deletions.
113 changes: 79 additions & 34 deletions orangecontrib/text/nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,24 @@
import shelve
import warnings
from datetime import date
from time import sleep
from urllib import request, parse
from functools import partial
from http.client import HTTPException
from time import sleep
from urllib import parse, request
from urllib.error import HTTPError, URLError

import numpy as np
from dateutil.parser import isoparse
from Orange.data import (
ContinuousVariable,
DiscreteVariable,
StringVariable,
TimeVariable,
)
from Orange.misc import environ

from Orange import data
from orangecontrib.text.corpus import Corpus

try:
from Orange.misc import environ
except ImportError:
from Orange.canvas.utils import environ
from orangecontrib.text.util import create_corpus

SLEEP = 1
TIMEOUT = 10
Expand All @@ -26,39 +31,70 @@
BASE_URL = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'


class NYT:
""" Class for fetching records from the NYT API. """
def keywords(doc, name):
    """Return the values of a document's keywords of one kind as one string.

    Parameters
    ----------
    doc : dict
        A single article record. Its optional ``"keywords"`` entry is read
        as a list of dicts with ``"name"`` and ``"value"`` items.
    name : str
        Keyword kind to select; only entries whose ``"name"`` equals this
        value contribute to the result.

    Returns
    -------
    str
        The selected values joined with ``", "``; an empty string when the
        document has no keywords or none of them match.
    """
    # ``or []`` also covers a record that carries an explicit
    # ``"keywords": None`` instead of omitting the key.
    kws = doc.get("keywords") or []
    # ``.get("name")`` so an entry without a name is skipped, not a KeyError.
    values = (kw.get("value") for kw in kws if kw.get("name") == name)
    # ``str.join`` raises TypeError on None, so drop entries without a value.
    return ", ".join(value for value in values if value is not None)

@staticmethod
def keywords(doc, name):
return ', '.join([kw.get('value')
for kw in doc.get('keywords', [])
if kw['name'] == name])

attributes = []
def parse_date(doc):
    """Convert a document's publication date to a timestamp.

    Parameters
    ----------
    doc : dict
        A single article record; its optional ``"pub_date"`` entry is passed
        to ``dateutil.parser.isoparse``.

    Returns
    -------
    float
        ``datetime.timestamp()`` of the parsed date, or ``np.nan`` when the
        record has no publication date (missing, ``None`` or empty string).

    Raises
    ------
    ValueError
        Propagated from ``isoparse`` for a non-empty string it cannot parse.
    """
    # Named ``pub_date`` so the local does not shadow ``datetime.date``,
    # which this module imports at top level.
    pub_date = doc.get("pub_date")
    if not pub_date:
        # Missing, None and "" all mean "unknown date".
        return np.nan
    # NOTE(review): for a string without a UTC offset ``timestamp()``
    # interprets the naive datetime in local time - confirm the API always
    # sends an offset.
    return isoparse(pub_date).timestamp()


class NYT:
""" Class for fetching records from the NYT API. """

class_vars = [
(data.DiscreteVariable('Section'), lambda doc: doc.get('section_name', None)),
(
partial(DiscreteVariable, "Section"),
lambda doc: doc.get("section_name", None),
),
]

tv = data.TimeVariable('Publication Date')
metas = [
(data.StringVariable('Headline'), lambda doc: doc.get('headline', {}).get('main') or ''),
(data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
(data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
(data.StringVariable('Lead Paragraph'), lambda doc: doc.get('lead_paragraph') or ''),
(data.StringVariable('Subject Keywords'), lambda doc: NYT.keywords(doc, 'subject')),
(data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
(data.StringVariable('Locations'), lambda doc: NYT.keywords(doc, 'glocations')),
(data.StringVariable('Persons'), lambda doc: NYT.keywords(doc, 'persons')),
(data.StringVariable('Organizations'), lambda doc: NYT.keywords(doc, 'organizations')),
(data.StringVariable('Creative Works'), lambda doc: NYT.keywords(doc, 'creative_works')),
(tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
(data.DiscreteVariable('Article Type'), lambda doc: doc.get('type_of_material', None)),
(data.ContinuousVariable('Word Count', number_of_decimals=0), lambda doc: doc.get('word_count', None)),
(
partial(StringVariable, "Headline"),
lambda doc: doc.get("headline", {}).get("main") or "",
),
(partial(StringVariable, "Abstract"), lambda doc: doc.get("abstract") or ""),
(partial(StringVariable, "Snippet"), lambda doc: doc.get("snippet") or ""),
(
partial(StringVariable, "Lead Paragraph"),
lambda doc: doc.get("lead_paragraph") or "",
),
(
partial(StringVariable, "Subject Keywords"),
partial(keywords, name="subject"),
),
(partial(StringVariable, "URL"), lambda doc: doc.get("web_url") or ""),
(
partial(StringVariable, "Locations"),
partial(keywords, name="glocations"),
),
(partial(StringVariable, "Persons"), partial(keywords, name="persons")),
(
partial(StringVariable, "Organizations"),
partial(keywords, name="organizations"),
),
(
partial(StringVariable, "Creative Works"),
partial(keywords, name="creative_works"),
),
(
partial(TimeVariable, "Publication Date", have_time=1, have_date=1),
parse_date,
),
(
partial(DiscreteVariable, "Article Type"),
lambda doc: doc.get("type_of_material", None),
),
(
partial(ContinuousVariable, "Word Count", number_of_decimals=0),
lambda doc: doc.get("word_count", None),
),
]

text_features = [metas[0][0], metas[1][0]] # headline + abstract
text_features = ["Headline", "Abstract"]

def __init__(self, api_key):
"""
Expand Down Expand Up @@ -129,8 +165,17 @@ def search(self, query, date_from=None, date_to=None, max_docs=None,
if len(records) > max_docs:
records = records[:max_docs]

return Corpus.from_documents(records, 'NY Times', self.attributes,
self.class_vars, self.metas, title_indices=[-1])
corpus = create_corpus(
documents=records,
attributes=[],
class_vars=self.class_vars,
metas=self.metas,
title_indices=[-1],
text_features=self.text_features,
name="NY Times",
)
corpus.attributes["language"] = "en" # NYT publishes only in English
return corpus

def _cache_init(self):
""" Initialize cache in Orange environment buffer dir. """
Expand Down
23 changes: 14 additions & 9 deletions orangecontrib/text/tests/test_nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from http.client import HTTPException
from urllib.error import HTTPError, URLError

from Orange.data import TimeVariable
from orangecontrib.text import Corpus
from orangecontrib.text.nyt import NYT, BATCH_SIZE

Expand Down Expand Up @@ -87,6 +86,7 @@ def test_nyt_key(self):

def test_nyt_query_keywords(self):
c = self.nyt.search('slovenia', max_docs=10)
self.assertEqual(c.language, "en")
self.assertIsInstance(c, Corpus)
self.assertEqual(len(c), 10)

Expand All @@ -96,8 +96,8 @@ def test_nyt_query_date_range(self):
corpus = self.nyt.search('slovenia', from_date, to_date, max_docs=10)
self.assertEqual(len(corpus), 10)

time_index = next(i for i, (var, _) in enumerate(NYT.metas) if isinstance(var, TimeVariable))
tv = corpus.domain.metas[time_index]
tv = corpus.domain["Publication Date"]
time_index = corpus.domain.metas.index(tv)
for doc in corpus:
date = tv.repr_val(doc.metas[time_index])
date = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S').date()
Expand All @@ -108,15 +108,20 @@ def test_nyt_query_date_range(self):
def test_nyt_query_max_records(self):
c = self.nyt.search('slovenia', max_docs=25)
self.assertEqual(len(c), 25)
self.assertEqual(c.language, "en")

def test_nyt_corpus_domain_generation(self):
corpus = self.nyt.search('slovenia', max_docs=10)
for var, _ in NYT.attributes:
self.assertIn(var, corpus.domain.attributes)
for var, _ in NYT.class_vars:
self.assertIn(var, corpus.domain.class_vars)
for var, _ in NYT.metas:
self.assertIn(var, corpus.domain.metas)
self.assertTupleEqual((), corpus.domain.attributes)
self.assertListEqual(
[var.args[0] for var, _ in NYT.class_vars],
[var.name for var in corpus.domain.class_vars]
)
self.assertListEqual(
[var.args[0] for var, _ in NYT.metas],
[var.name for var in corpus.domain.metas]
)
self.assertEqual(corpus.language, "en")

def test_nyt_result_caching(self):
self.nyt._fetch_page('slovenia', None, None, 0) # assure in cache
Expand Down
10 changes: 6 additions & 4 deletions orangecontrib/text/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from functools import wraps
from math import ceil
from typing import Union, List, Callable, Any, Tuple
from typing import Union, List, Callable, Any, Tuple, Optional

import numpy as np
import scipy.sparse as sp
Expand Down Expand Up @@ -102,10 +102,10 @@ def create_corpus(
title_indices: List[int],
text_features: List[str],
name: str,
language_attribute: str,
language_attribute: Optional[str] = None,
):
"""
Create a corpus from list of features/documents produced by modelu such as
Create a corpus from a list of features/documents produced by modules such as
Guardian/NYT
Parameters
Expand Down Expand Up @@ -160,7 +160,9 @@ def to_val(attr, val):
Y = np.array(Y, dtype=np.float64)
metas = np.array(metas, dtype=object)

language = infer_language_from_variable(domain[language_attribute])
language = None
if language_attribute is not None:
language = infer_language_from_variable(domain[language_attribute])
corpus = Corpus.from_numpy(
domain=domain,
X=X,
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/ownyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class Outputs:
date_to = Setting(datetime.now().date())

# NYT.metas holds (functools.partial(VariableClass, name, ...), getter) pairs,
# so the variable class is ``feat.func`` and its name is the first positional
# argument. An isinstance() check against StringVariable never matches a
# partial and would leave this list empty.
attributes = [
    feat.args[0]
    for feat, _ in NYT.metas
    if getattr(feat, "func", None) is StringVariable
]
# NYT.text_features is already a list of feature names. Copy it rather than
# wrapping it, so the default is a flat list of names and does not alias the
# class attribute.
text_includes = Setting(list(NYT.text_features))

class Warning(OWWidget.Warning):
no_text_fields = Msg('Text features are inferred when none are selected.')
Expand Down

0 comments on commit 07b0cbf

Please sign in to comment.