Guardian - infer language from corpus
PrimozGodec committed Jan 11, 2023
1 parent 25952ea commit 037148b
Showing 4 changed files with 215 additions and 60 deletions.
68 changes: 41 additions & 27 deletions orangecontrib/text/guardian.py
@@ -14,16 +14,21 @@
10
"""

import requests
import math
import json
import os
from functools import partial

from Orange import data

from orangecontrib.text.corpus import Corpus
import requests
from Orange.data import (
StringVariable,
DiscreteVariable,
ContinuousVariable,
TimeVariable,
)
from dateutil.parser import isoparse

from orangecontrib.text.util import create_corpus

BASE_URL = 'http://content.guardianapis.com/search'
ARTICLES_PER_PAGE = 10
@@ -53,29 +58,33 @@ def __eq__(self, other):


class TheGuardianAPI:
attributes = []

class_vars = [
(data.DiscreteVariable('Section'), lambda doc: doc['sectionName']),
(partial(DiscreteVariable, "Section"), lambda doc: doc["sectionName"]),
]

tv = data.TimeVariable('Publication Date')
metas = [
(data.StringVariable('Headline'), lambda doc: doc['fields']['headline']),
(data.StringVariable('Content'), lambda doc: doc['fields']['bodyText']),
(data.StringVariable('Trail Text'), lambda doc: doc['fields']['trailText']),
(data.StringVariable('HTML'), lambda doc: doc['fields']['body']),
(tv, lambda doc: TheGuardianAPI.tv.parse(doc['webPublicationDate'])),
(data.DiscreteVariable('Type'), lambda doc: doc['type']),
(data.DiscreteVariable('Language'), lambda doc: doc['fields']['lang']),
(data.StringVariable('Tags'),
lambda doc: ', '.join(tag['webTitle'] for tag in doc['tags'])),
(data.StringVariable('URL'), lambda doc: doc['webUrl']),
(data.ContinuousVariable('Word Count', number_of_decimals=0),
lambda doc: doc['fields']['wordcount']),
(partial(StringVariable, "Headline"), lambda doc: doc["fields"]["headline"]),
(partial(StringVariable, "Content"), lambda doc: doc["fields"]["bodyText"]),
(partial(StringVariable, "Trail Text"), lambda doc: doc["fields"]["trailText"]),
(partial(StringVariable, "HTML"), lambda doc: doc["fields"]["body"]),
(
partial(TimeVariable, "Publication Date"),
lambda doc: isoparse(doc["webPublicationDate"]).timestamp(),
),
(partial(DiscreteVariable, "Type"), lambda doc: doc["type"]),
(partial(DiscreteVariable, "Language"), lambda doc: doc["fields"]["lang"]),
(
partial(StringVariable, "Tags"),
lambda doc: ", ".join(tag["webTitle"] for tag in doc["tags"]),
),
(partial(StringVariable, "URL"), lambda doc: doc["webUrl"]),
(
partial(ContinuousVariable, "Word Count", number_of_decimals=0),
lambda doc: doc["fields"]["wordcount"],
),
]

text_features = [metas[0][0], metas[1][0]] # Headline + Content
text_features = ["Headline", "Content"] #
title_indices = [-1] # Headline

def __init__(self, credentials, on_progress=None, should_break=None):
@@ -156,11 +165,16 @@ def search(self, query, from_date=None, to_date=None, max_documents=None,
self._search(query, from_date, to_date, p)
self.on_progress(p*self.per_page, pages * self.per_page)

c = Corpus.from_documents(
self.results, 'The Guardian', self.attributes, self.class_vars,
self.metas, title_indices=self.title_indices)
c.text_features = self.text_features
return c
return create_corpus(
self.results,
[],
self.class_vars,
self.metas,
self.title_indices,
self.text_features,
"The Guardian",
"Language",
)


if __name__ == '__main__':
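The rewritten `search` now builds the corpus through `create_corpus`, which also infers the corpus language from the downloaded articles. A minimal usage sketch, assuming the module's `TheGuardianCredentials` class and a valid API key (the key below is a placeholder):

```python
from orangecontrib.text.guardian import TheGuardianAPI, TheGuardianCredentials

credentials = TheGuardianCredentials("<your-api-key>")  # placeholder key
api = TheGuardianAPI(credentials)
corpus = api.search("Slovenia", max_documents=10)
# corpus.language is e.g. "en" when all articles report the same language,
# otherwise it stays None
print(len(corpus), corpus.language)
```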
115 changes: 86 additions & 29 deletions orangecontrib/text/tests/test_guardian.py
@@ -3,11 +3,70 @@

from datetime import date, datetime
from unittest import mock
from unittest.mock import Mock

from orangecontrib.text import guardian


API_KEY = os.getenv('THE_GUARDIAN_API_KEY', 'test')
API_KEY = os.getenv("THE_GUARDIAN_API_KEY", "test")
responses = [
"""
{
"response": {
"pages": 2,
"results": [
{
"type": "article",
"sectionName": "World news",
"webPublicationDate": "2018-07-05T23:27:25Z",
"webUrl": "https://www.theguardian.com/world/2018/jul/06",
"fields": {
"headline": "Rohingya refugees reject UN-Myanmar repatriati",
"trailText": "Leaders say agreement does not address concer",
"body": "<p><strong><strong><strong></strong></strong></str",
"wordcount": "512",
"lang": "en",
"bodyText": "Rohingya community leaders have rejected an."
},
"tags": [
{
"webTitle": "Myanmar"
}
]
}
]
}
}
""",
"""
{
"response": {
"pages": 2,
"results": [
{
"type": "article",
"sectionName": "World news",
"webPublicationDate": "2018-07-05T23:27:25Z",
"webUrl": "https://www.theguardian.com/world/2018/jul/06",
"fields": {
"headline": "Rohingya refugees reject UN-Myanmar repatriati",
"trailText": "Leaders say agreement does not address concer",
"body": "<p><strong><strong><strong></strong></strong></str",
"wordcount": "512",
"lang": "fr",
"bodyText": "Rohingya community leaders have rejected an."
},
"tags": [
{
"webTitle": "Myanmar"
}
]
}
]
}
}
""",
]


class TestCredentials(unittest.TestCase):
@@ -79,33 +138,31 @@ def test_api_limit_error(self, mock_get):

@mock.patch('requests.get')
def test_search_mock_data(self, mock_get):
mock_get().text = """
{
"response": {
"pages": 2,
"results": [
{
"type": "article",
"sectionName": "World news",
"webPublicationDate": "2018-07-05T23:27:25Z",
"webUrl": "https://www.theguardian.com/world/2018/jul/06",
"fields": {
"headline": "Rohingya refugees reject UN-Myanmar repatriati",
"trailText": "Leaders say agreement does not address concer",
"body": "<p><strong><strong><strong></strong></strong></str",
"wordcount": "512",
"lang": "en",
"bodyText": "Rohingya community leaders have rejected an."
},
"tags": [
{
"webTitle": "Myanmar"
}
]
}
]
}
}
"""
mock_get.return_value.text = responses[0]
corp = self.api.search('Slovenia')
self.assertEqual(len(corp), 2)

@mock.patch("requests.get")
def test_article_language(self, mock_get):
mms = []
for r in responses:
mms.append(Mock())
mms[-1].text = r

mock_get.side_effect = mms
# language should be None since returned articles have different languages
corpus = self.api.search("Slovenia")
self.assertIsNone(corpus.language)

mock_get.side_effect = [mms[0], mms[0]]
# corpus language should be set since articles have the same language
corpus = self.api.search("Slovenia")
self.assertEqual("en", corpus.language)

mock_get.side_effect = [mms[1], mms[1]]
corpus = self.api.search("Slovenia")
self.assertEqual("fr", corpus.language)


if __name__ == "__main__":
unittest.main()
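The new test leans on `infer_language_from_variable`, whose implementation is not part of this diff. Below is a minimal sketch of the rule the test exercises, assuming the function simply inspects the values collected on the `Language` variable (the real code in `orangecontrib.text.language` may differ):

```python
from typing import Optional

from Orange.data import DiscreteVariable


def infer_language_from_variable(variable: DiscreteVariable) -> Optional[str]:
    # Sketch only: if every document carries the same language code
    # ("en", "fr", ...), that code becomes the corpus language;
    # mixed languages yield None, as the first assertion above expects.
    values = set(variable.values)
    return values.pop() if len(values) == 1 else None
```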
85 changes: 84 additions & 1 deletion orangecontrib/text/util.py
@@ -1,11 +1,15 @@
from functools import wraps
from math import ceil
from typing import Union, List
from typing import Union, List, Callable, Any, Tuple

import numpy as np
import scipy.sparse as sp
from Orange.data import Domain, DiscreteVariable
from gensim.matutils import Sparse2Corpus

from orangecontrib.text import Corpus
from orangecontrib.text.language import infer_language_from_variable


def chunks(iterable, chunk_size):
""" Splits iterable objects into chunk of fixed size.
@@ -88,3 +92,82 @@ def __getitem__(
"""
sparse = self.sparse.__getitem__((slice(None, None, None), key))
return Sparse2CorpusSliceable(sparse)


def create_corpus(
documents: List[Any],
attributes: List[Tuple[Callable, Callable]],
class_vars: List[Tuple[Callable, Callable]],
metas: List[Tuple[Callable, Callable]],
title_indices: List[int],
text_features: List[str],
name: str,
language_attribute: str,
):
"""
Create a corpus from a list of documents produced by modules such as
Guardian or NYT.
Parameters
----------
documents
List of values downloaded from the API.
attributes
List of attributes and recipes on how to extract values from documents.
class_vars
List of class attributes and recipes on how to extract values from documents.
metas
List of meta attributes and recipes on how to extract values from documents.
title_indices
Indices of the title attributes.
text_features
Names of text features
name
The name of the Corpus
language_attribute
The name of the attribute used to infer the corpus language.
Returns
-------
Corpus with documents.
"""
domain = Domain(
attributes=[attr() for attr, _ in attributes],
class_vars=[attr() for attr, _ in class_vars],
metas=[attr() for attr, _ in metas],
)
for ind in title_indices:
domain[ind].attributes["title"] = True

def to_val(attr, val):
if isinstance(attr, DiscreteVariable):
attr.val_from_str_add(val)
return attr.to_val(val)

X = [
[to_val(a, f(doc)) for a, (_, f) in zip(domain.attributes, attributes)]
for doc in documents
]
Y = [
[to_val(a, f(doc)) for a, (_, f) in zip(domain.class_vars, class_vars)]
for doc in documents
]
metas = [
[to_val(a, f(doc)) for a, (_, f) in zip(domain.metas, metas)]
for doc in documents
]
X = np.array(X, dtype=np.float64)
Y = np.array(Y, dtype=np.float64)
metas = np.array(metas, dtype=object)

language = infer_language_from_variable(domain[language_attribute])
corpus = Corpus.from_numpy(
domain=domain,
X=X,
Y=Y,
metas=metas,
text_features=[domain[f] for f in text_features],
language=language,
)
corpus.name = name
return corpus
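
A hypothetical call to the new `create_corpus` helper, mirroring how the Guardian module uses it; the documents and field names below are invented for illustration:

```python
from functools import partial

from Orange.data import DiscreteVariable, StringVariable

from orangecontrib.text.util import create_corpus

docs = [
    {"lang": "en", "headline": "First", "body": "Some text."},
    {"lang": "en", "headline": "Second", "body": "More text."},
]

corpus = create_corpus(
    documents=docs,
    attributes=[],
    class_vars=[(partial(DiscreteVariable, "Language"), lambda d: d["lang"])],
    metas=[
        (partial(StringVariable, "Headline"), lambda d: d["headline"]),
        (partial(StringVariable, "Content"), lambda d: d["body"]),
    ],
    title_indices=[-1],  # domain[-1] is the first meta, i.e. Headline
    text_features=["Headline", "Content"],
    name="Example",
    language_attribute="Language",
)
print(corpus.language)  # "en" - every document reports the same language
```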
7 changes: 4 additions & 3 deletions orangecontrib/text/widgets/owguardian.py
@@ -78,9 +78,10 @@ class Outputs:
recent_queries = Setting([])
date_from = Setting((datetime.now().date() - timedelta(365)))
date_to = Setting(datetime.now().date())
attributes = [feat.name for feat, _ in TheGuardianAPI.metas if
isinstance(feat, StringVariable)]
text_includes = Setting([feat.name for feat in TheGuardianAPI.text_features])
attributes = [
part.args[0] for part, _ in TheGuardianAPI.metas if part.func is StringVariable
]
text_includes = Setting([feat for feat in TheGuardianAPI.text_features])

class Warning(OWWidget.Warning):
no_text_fields = Msg('Text features are inferred when none are selected.')
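Since the metas now hold `functools.partial` wrappers rather than instantiated variables, the widget reads the variable name from the partial's bound arguments. A short illustration of the attributes relied on here:

```python
from functools import partial

from Orange.data import StringVariable

part = partial(StringVariable, "Headline")
assert part.func is StringVariable  # the wrapped constructor
assert part.args == ("Headline",)   # the bound variable name
variable = part()                   # builds a fresh StringVariable on each call
assert variable.name == "Headline"
```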
