Keep language code even if country code can't be normalised. Add a priority to the result, so that we can keep the most specific match.
tilezen · Aug 19, 2016 · 4df4863 · 4df4863
1 parent 681429b
commit 4df4863
Show file tree

Hide file tree

Showing 2 changed files with 109 additions and 15 deletions.
diff --git a/test/test_transform.py b/test/test_transform.py
@@ -1,4 +1,5 @@
 import unittest
+from collections import OrderedDict
 
 
 class BuildingsClassTest(unittest.TestCase):
@@ -54,6 +55,8 @@ class L10nOsmTransformTest(unittest.TestCase):
     def _call_fut(self, x):
         from vectordatasource.transform import _convert_osm_l10n_name
         result = _convert_osm_l10n_name(x)
+        if result:
+            result = result.code
         return result
 
     def test_osm_convert_2_3(self):
@@ -73,8 +76,8 @@ def test_osm_convert_country(self):
         self.assertEquals(eng_gb, 'en_GB')
 
     def test_osm_convert_country_invalid(self):
-        not_found = self._call_fut('en_foo')
-        self.assertIsNone(not_found)
+        no_country = self._call_fut('en_foo')
+        self.assertEquals(no_country, 'en')
 
     def test_osm_convert_lookup(self):
         zh_min_nan = self._call_fut('zh-min-nan')
@@ -88,6 +91,8 @@ class L10nWofTransformTest(unittest.TestCase):
     def _call_fut(self, x):
         from vectordatasource.transform import _convert_wof_l10n_name
         result = _convert_wof_l10n_name(x)
+        if result:
+            result = result.code
         return result
 
     def test_osm_convert_valid(self):
@@ -128,6 +133,71 @@ def test_wof_source(self):
         self.assertEquals('foo', props['name:en'])
 
 
+class TagsPriorityI18nTest(unittest.TestCase):
+
+    def _call_fut(self, source, kvs):
+        from vectordatasource.transform import tags_name_i18n
+        shape = fid = zoom = None
+
+        # need to control the order of tags so that we can force the situation
+        # where one key overwrites another.
+        tags = OrderedDict()
+        for k, v in kvs.items():
+            tags['name:%s' % k] = v
+
+        props = dict(
+            source=source,
+            tags=tags,
+            name='unused',
+        )
+        result = tags_name_i18n(shape, props, fid, zoom)
+        return result
+
+    def test_wof_no_two_letter_code(self):
+        # given variants which have no 2-letter code (arq), then we should
+        # just be left with the ones which do (ara).
+        shape, props, fid = self._call_fut('whosonfirst.mapzen.com',
+                                           {'ara': 'foo', 'arq': 'bar'})
+        self.assertTrue('name:ar' in props)
+        self.assertFalse('name:ara' in props)
+        self.assertFalse('name:arq' in props)
+        self.assertEquals('foo', props['name:ar'])
+
+    def test_osm_invalid_country_code(self):
+        # given variants with an invalid or unrecognised country code, then
+        # we should keep any original which had no country code, as it is
+        # more specific.
+        langs = OrderedDict([
+            ('en',    'foo'),  # The One True Flavour of English.
+            ('en_GB', 'bar'),  # Also the correct flavour ;-)
+            ('en_AA', 'baz'),  # User-defined country code.
+            ('en_CT', 'bat'),  # Currently unassigned/deleted code.
+        ])
+        shape, props, fid = self._call_fut('openstreetmap.org', langs)
+
+        self.assertEquals('foo', props.get('name:en'))
+        self.assertEquals('bar', props.get('name:en_GB'))
+        self.assertFalse('name:en_AA' in props)
+        self.assertFalse('name:en_CT' in props)
+
+    def test_osm_invalid_country_code_reverse(self):
+        # same as the previous test, just checking that when the order of
+        # the keys is different (we wouldn't normally have control over it
+        # as it's in a dict), the result is the same.
+        langs = OrderedDict([
+            ('en_GB', 'bar'),
+            ('en_AA', 'baz'),
+            ('en_CT', 'bat'),
+            ('en',    'foo'),
+        ])
+        shape, props, fid = self._call_fut('openstreetmap.org', langs)
+
+        self.assertEquals('foo', props.get('name:en'))
+        self.assertEquals('bar', props.get('name:en_GB'))
+        self.assertFalse('name:en_AA' in props)
+        self.assertFalse('name:en_CT' in props)
+
+
 class DropFeaturesMinPixelsTest(unittest.TestCase):
 
     def _make_feature_layers(self, pixel_threshold, shape):

diff --git a/vectordatasource/transform.py b/vectordatasource/transform.py
@@ -1,6 +1,6 @@
 # transformation functions to apply to features
 
-from collections import defaultdict
+from collections import defaultdict, namedtuple
 from numbers import Number
 from shapely.geometry.collection import GeometryCollection
 from shapely.geometry import box as Box
@@ -341,6 +341,20 @@ def tags_remove(shape, properties, fid, zoom):
 )
 
 
+def _iso639_1_code_of(lang):
+    try:
+        iso639_1_code = lang.iso639_1_code.encode('utf-8')
+    except AttributeError:
+        return None
+    return iso639_1_code
+
+
+# a structure to return language code lookup results preserving the priority
+# (lower is better) of the result for use in situations where multiple inputs
+# can map to the same output.
+LangResult = namedtuple('LangResult', ['code', 'priority'])
+
+
 def _convert_wof_l10n_name(x):
     lang_str_iso_639_3 = x[:3]
     if len(lang_str_iso_639_3) != 3:
@@ -349,7 +363,7 @@ def _convert_wof_l10n_name(x):
         lang = pycountry.languages.get(iso639_3_code=lang_str_iso_639_3)
     except KeyError:
         return None
-    return lang.iso639_1_code
+    return LangResult(code=_iso639_1_code_of(lang), priority=0)
 
 
 def _normalize_osm_lang_code(x):
@@ -366,11 +380,7 @@ def _normalize_osm_lang_code(x):
                 lang = pycountry.languages.get(iso639_3_code=x)
             except KeyError:
                 return None
-    try:
-        iso639_1_code = lang.iso639_1_code.encode('utf-8')
-    except AttributeError:
-        return None
-    return iso639_1_code
+    return _iso639_1_code_of(lang)
 
 
 def _normalize_country_code(x):
@@ -397,7 +407,7 @@ def _normalize_country_code(x):
 
 def _convert_osm_l10n_name(x):
     if x in osm_l10n_lookup:
-        return x
+        return LangResult(code=x, priority=0)
 
     if '_' not in x:
         lang_code_candidate = x
@@ -411,17 +421,20 @@ def _convert_osm_l10n_name(x):
     if lang_code_result is None:
         return None
 
+    priority = 0
     if country_candidate:
         country_result = _normalize_country_code(country_candidate)
         if country_result is None:
-            return None
+            result = lang_code_result
+            priority = 1
 
-        result = '%s_%s' % (lang_code_result, country_result)
+        else:
+            result = '%s_%s' % (lang_code_result, country_result)
 
     else:
         result = lang_code_result
 
-    return result
+    return LangResult(code=result, priority=priority)
 
 
 def tags_name_i18n(shape, properties, fid, zoom):
@@ -451,16 +464,27 @@ def tags_name_i18n(shape, properties, fid, zoom):
         # become available.
         return shape, properties, fid
 
+    langs = {}
     for k, v in tags.items():
         if v == name:
             continue
         for candidate in alt_name_prefix_candidates:
+
             if k.startswith(candidate):
                 lang_code = k[len(candidate):]
                 normalized_lang_code = convert_fn(lang_code)
+
                 if normalized_lang_code:
-                    lang_key = '%s%s' % (candidate, normalized_lang_code)
-                    properties[lang_key] = v
+                    code = normalized_lang_code.code
+                    priority = normalized_lang_code.priority
+                    lang_key = '%s%s' % (candidate, code)
+
+                    if lang_key not in langs or \
+                       priority < langs[lang_key][0].priority:
+                        langs[lang_key] = (normalized_lang_code, v)
+
+    for lang_key, (lang, v) in langs.items():
+        properties[lang_key] = v
 
     for alt_tag_name_candidate in tag_name_alternates:
         alt_tag_name_value = tags.get(alt_tag_name_candidate)