Skip to content

Commit

Permalink
NYT - language to corpus and fresh variable initialization
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Mar 24, 2023
1 parent 3a9cb5a commit 76f8e6c
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 49 deletions.
108 changes: 73 additions & 35 deletions orangecontrib/text/nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,24 @@
import shelve
import warnings
from datetime import date
from time import sleep
from urllib import request, parse
from functools import partial
from http.client import HTTPException
from time import sleep
from urllib import parse, request
from urllib.error import HTTPError, URLError

import numpy as np
from dateutil.parser import isoparse
from Orange.data import (
ContinuousVariable,
DiscreteVariable,
StringVariable,
TimeVariable,
)
from Orange.misc import environ

from Orange import data
from orangecontrib.text.corpus import Corpus

try:
from Orange.misc import environ
except ImportError:
from Orange.canvas.utils import environ
from orangecontrib.text.util import create_corpus

SLEEP = 1
TIMEOUT = 10
Expand All @@ -31,34 +36,65 @@ class NYT:

@staticmethod
def keywords(doc, name):
    """
    Collect the values of all keywords of one kind from an article record.

    Args:
        doc (dict): Article record; its optional ``"keywords"`` entry is a
            list of dicts with ``"name"`` and ``"value"`` keys.
        name (str): Keyword kind to select (e.g. ``"subject"``).

    Returns:
        str: Matching keyword values joined with ``", "``; an empty string
            when nothing matches.
    """
    # `or []` also covers an explicit null "keywords" entry.
    kws = doc.get("keywords") or []
    # Entries without a name or value are skipped: indexing kw["name"] would
    # raise KeyError and str.join raises TypeError on None.
    return ", ".join(
        kw["value"]
        for kw in kws
        if kw.get("name") == name and kw.get("value") is not None
    )

attributes = []
@staticmethod
def parse_date(doc):
    """
    Convert an article's publication date to a timestamp.

    Args:
        doc (dict): Article record; ``"pub_date"`` is read as an ISO-8601
            string when present.

    Returns:
        float: ``isoparse(pub_date).timestamp()``, or ``np.nan`` when the
            date is missing or empty.
    """
    # Named `pub_date` so the module-level `datetime.date` import is not
    # shadowed inside this function.
    pub_date = doc.get("pub_date")
    # An empty string is treated like a missing date; passing it on to
    # isoparse would raise ValueError.
    if not pub_date:
        return np.nan
    return isoparse(pub_date).timestamp()

class_vars = [
(data.DiscreteVariable('Section'), lambda doc: doc.get('section_name', None)),
(
partial(DiscreteVariable, "Section"),
lambda doc: doc.get("section_name", None),
),
]

tv = data.TimeVariable('Publication Date')
metas = [
(data.StringVariable('Headline'), lambda doc: doc.get('headline', {}).get('main') or ''),
(data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
(data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
(data.StringVariable('Lead Paragraph'), lambda doc: doc.get('lead_paragraph') or ''),
(data.StringVariable('Subject Keywords'), lambda doc: NYT.keywords(doc, 'subject')),
(data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
(data.StringVariable('Locations'), lambda doc: NYT.keywords(doc, 'glocations')),
(data.StringVariable('Persons'), lambda doc: NYT.keywords(doc, 'persons')),
(data.StringVariable('Organizations'), lambda doc: NYT.keywords(doc, 'organizations')),
(data.StringVariable('Creative Works'), lambda doc: NYT.keywords(doc, 'creative_works')),
(tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
(data.DiscreteVariable('Article Type'), lambda doc: doc.get('type_of_material', None)),
(data.ContinuousVariable('Word Count', number_of_decimals=0), lambda doc: doc.get('word_count', None)),
(
partial(StringVariable, "Headline"),
lambda doc: doc.get("headline", {}).get("main") or "",
),
(partial(StringVariable, "Abstract"), lambda doc: doc.get("abstract") or ""),
(partial(StringVariable, "Snippet"), lambda doc: doc.get("snippet") or ""),
(
partial(StringVariable, "Lead Paragraph"),
lambda doc: doc.get("lead_paragraph") or "",
),
(
partial(StringVariable, "Subject Keywords"),
lambda doc: NYT.keywords(doc, "subject"),
),
(partial(StringVariable, "URL"), lambda doc: doc.get("web_url") or ""),
(
partial(StringVariable, "Locations"),
lambda doc: NYT.keywords(doc, "glocations"),
),
(partial(StringVariable, "Persons"), lambda doc: NYT.keywords(doc, "persons")),
(
partial(StringVariable, "Organizations"),
lambda doc: NYT.keywords(doc, "organizations"),
),
(
partial(StringVariable, "Creative Works"),
lambda doc: NYT.keywords(doc, "creative_works"),
),
(
partial(TimeVariable, "Publication Date", have_time=1, have_date=1),
parse_date,
),
(
partial(DiscreteVariable, "Article Type"),
lambda doc: doc.get("type_of_material", None),
),
(
partial(ContinuousVariable, "Word Count", number_of_decimals=0),
lambda doc: doc.get("word_count", None),
),
]

text_features = [metas[0][0], metas[1][0]] # headline + abstract
text_features = ["Headline", "Abstract"]

def __init__(self, api_key):
"""
Expand Down Expand Up @@ -129,15 +165,17 @@ def search(self, query, date_from=None, date_to=None, max_docs=None,
if len(records) > max_docs:
records = records[:max_docs]

return Corpus.from_documents(
records,
"NY Times",
self.attributes,
self.class_vars,
self.metas,
corpus = create_corpus(
documents=records,
attributes=[],
class_vars=self.class_vars,
metas=self.metas,
title_indices=[-1],
language="en",
text_features=self.text_features,
name="NY Times",
)
corpus.attributes["language"] = "en" # NYT publishes only in English
return corpus

def _cache_init(self):
""" Initialize cache in Orange environment buffer dir. """
Expand Down
22 changes: 13 additions & 9 deletions orangecontrib/text/tests/test_nyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from http.client import HTTPException
from urllib.error import HTTPError, URLError

from Orange.data import TimeVariable
from orangecontrib.text import Corpus
from orangecontrib.text.nyt import NYT, BATCH_SIZE

Expand Down Expand Up @@ -97,8 +96,8 @@ def test_nyt_query_date_range(self):
corpus = self.nyt.search('slovenia', from_date, to_date, max_docs=10)
self.assertEqual(len(corpus), 10)

time_index = next(i for i, (var, _) in enumerate(NYT.metas) if isinstance(var, TimeVariable))
tv = corpus.domain.metas[time_index]
tv = corpus.domain["Publication Date"]
time_index = corpus.domain.metas.index(tv)
for doc in corpus:
date = tv.repr_val(doc.metas[time_index])
date = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S').date()
Expand All @@ -109,15 +108,20 @@ def test_nyt_query_date_range(self):
def test_nyt_query_max_records(self):
    """A search capped at 25 documents returns 25 documents tagged "en"."""
    corpus = self.nyt.search('slovenia', max_docs=25)
    n_documents = len(corpus)
    self.assertEqual(n_documents, 25)
    self.assertEqual(corpus.language, "en")

def test_nyt_corpus_domain_generation(self):
    """Check that the corpus domain mirrors the variables declared on NYT."""
    corpus = self.nyt.search('slovenia', max_docs=10)
    # NYT declares its variables as functools.partial factories, so the
    # domain is compared by variable name (the factory's first positional
    # argument) instead of by membership of the factory objects.
    self.assertTupleEqual((), corpus.domain.attributes)
    self.assertListEqual(
        [var.args[0] for var, _ in NYT.class_vars],
        [var.name for var in corpus.domain.class_vars]
    )
    self.assertListEqual(
        [var.args[0] for var, _ in NYT.metas],
        [var.name for var in corpus.domain.metas]
    )
    self.assertEqual(corpus.language, "en")

def test_nyt_result_caching(self):
self.nyt._fetch_page('slovenia', None, None, 0) # assure in cache
Expand Down
10 changes: 6 additions & 4 deletions orangecontrib/text/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from functools import wraps
from math import ceil
from typing import Union, List, Callable, Any, Tuple
from typing import Union, List, Callable, Any, Tuple, Optional

import numpy as np
import scipy.sparse as sp
Expand Down Expand Up @@ -102,10 +102,10 @@ def create_corpus(
title_indices: List[int],
text_features: List[str],
name: str,
language_attribute: str,
language_attribute: Optional[str] = None,
):
"""
Create a corpus from a list of features/documents produced by modules such as
Guardian/NYT
Parameters
Expand Down Expand Up @@ -160,7 +160,9 @@ def to_val(attr, val):
Y = np.array(Y, dtype=np.float64)
metas = np.array(metas, dtype=object)

language = infer_language_from_variable(domain[language_attribute])
language = None
if language_attribute is not None:
language = infer_language_from_variable(domain[language_attribute])
corpus = Corpus.from_numpy(
domain=domain,
X=X,
Expand Down
2 changes: 1 addition & 1 deletion orangecontrib/text/widgets/ownyt.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class Outputs:
date_to = Setting(datetime.now().date())

# NYT.metas holds (functools.partial(VariableType, name, ...), getter) pairs
# rather than variable instances, so string metas are selected through the
# partial's `func` and the name is read from its first positional argument.
attributes = [
    factory.args[0]
    for factory, _ in NYT.metas
    if factory.func is StringVariable
]
# NYT.text_features is already a list of feature names; copy it instead of
# wrapping it in another list so the default stays a flat list of strings.
text_includes = Setting(list(NYT.text_features))

class Warning(OWWidget.Warning):
no_text_fields = Msg('Text features are inferred when none are selected.')
Expand Down

0 comments on commit 76f8e6c

Please sign in to comment.