
Merge pull request #975 from tilezen/zerebubuth/972-use-2-letter-language-codes

Use 2 letter language codes
zerebubuth authored Aug 19, 2016
2 parents 90e5c28 + 4df4863 commit 9f72d0e
Showing 5 changed files with 157 additions and 60 deletions.
24 changes: 12 additions & 12 deletions SEMANTIC-VERSIONING.md
@@ -132,18 +132,18 @@ In addition to the `common` **name** locals call a place, the following `common`

#### Common languages:

1. `name:ara` Arabic
1. `name:zho` Chinese, traditional or simplified
1. `name:eng` English
1. `name:fra` French
1. `name:rus` Russian
1. `name:spa` Spanish
1. `name:deu` German
1. `name:gre` Greek
1. `name:ita` Italian
1. `name:jpn` Japanese
1. `name:kor` Korean
1. `name:vie` Vietnamese
1. `name:ar` Arabic
1. `name:zh` Chinese, traditional or simplified
1. `name:en` English
1. `name:fr` French
1. `name:ru` Russian
1. `name:es` Spanish
1. `name:de` German
1. `name:gr` Greek
1. `name:it` Italian
1. `name:jp` Japanese
1. `name:ko` Korean
1. `name:vi` Vietnamese

Arabic, Chinese, English, French, Russian and Spanish are used by the United Nations for meetings and official documents. The other languages listed are frequently used in OpenStreetMap and Who's On First.
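
For illustration only (this sketch is not part of the commit), the 2-letter keys above can be derived from 3-letter ISO 639-3 codes with pycountry, in the spirit of the `transform.py` changes further down. The `iso639_3_code`/`iso639_1_code` names mirror the pycountry API used in this diff; newer pycountry releases spell them `alpha_3`/`alpha_2`.

```python
# Illustrative sketch only -- not part of this commit. Shows how a 3-letter
# ISO 639-3 code such as 'eng' reduces to the 2-letter 'en' used in the list
# above. Attribute and keyword names mirror the pycountry API used in
# transform.py in this diff.
import pycountry


def two_letter_code(iso639_3_code):
    try:
        lang = pycountry.languages.get(iso639_3_code=iso639_3_code)
    except KeyError:
        # unknown 3-letter code
        return None
    # some languages (e.g. 'arq', Algerian Arabic) have no 2-letter code
    return getattr(lang, 'iso639_1_code', None)


print(two_letter_code('eng'))  # expected: 'en'
print(two_letter_code('arq'))  # expected: None
```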

2 changes: 1 addition & 1 deletion docs/layers.md
@@ -16,7 +16,7 @@ Mapzen primarily sources from OpenStreetMap, but includes a variety of other ope

#### Name localization

Mapzen vector tile features include the default `name` property. We include all language variants of the `name:*`, `alt_name:*`, `alt_name_`, `old_name:*` values to enable full internationalization (when different than `name`). Tangram supports all language scripts.
Mapzen vector tile features include the default `name` property. We include all language variants of the `name:*`, `alt_name:*`, `alt_name_`, `old_name:*` values to enable full internationalization (when different than `name`). Tangram supports all language scripts. Language variants are identified by an ISO 639-1 two-letter language code and optional country, for example `en_GB` for British English.

For features in the `boundaries` layer, we support two additional variants `left:name:*` and `right:name:*` to support oriented labeling on the appropriate side of the boundary line (so the labeled polygon's text can appear inside that polygon consistently).
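
As a hypothetical illustration of the scheme described above, a feature's properties might look like the sketch below. The keys follow the documentation; the feature, languages and values are invented purely for this example.

```python
# Hypothetical feature properties under the naming scheme described above.
place_properties = {
    'name': 'Deutschland',
    'name:en': 'Germany',       # ISO 639-1 two-letter language code
    'name:fr': 'Allemagne',
    'name:en_GB': 'Germany',    # optional country suffix, e.g. British English
}

# On a `boundaries` layer feature, the left/right variants allow each side of
# the shared line to be labelled with the neighbouring polygon's name.
boundary_properties = {
    'left:name:en': 'Germany',
    'right:name:en': 'France',
}
```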

14 changes: 7 additions & 7 deletions integration-test/418-wof-l10n_name.py
@@ -5,7 +5,7 @@
{ 'id': 85826037, 'kind': 'neighbourhood',
'source': "whosonfirst.mapzen.com",
'name': 'Hollywood',
'name:kor': '\xed\x97\x90\xeb\xa6\xac\xec\x9a\xb0\xeb\x93\x9c' })
'name:ko': '\xed\x97\x90\xeb\xa6\xac\xec\x9a\xb0\xeb\x93\x9c' })

# San Francisco (wof neighbourhood)
# https://whosonfirst.mapzen.com/data/858/826/41/85882641.geojson
@@ -14,7 +14,7 @@
{ 'id': 85882641, 'kind': 'neighbourhood',
'source': "whosonfirst.mapzen.com",
'name': 'San Francisco',
'name:spa': type(None) })
'name:es': type(None) })

# San Francisco (osm city)
# http://www.openstreetmap.org/node/26819236
@@ -23,20 +23,20 @@
{ 'id': 26819236, 'kind': 'locality', 'kind_detail': 'city',
'source': "openstreetmap.org",
'name': 'San Francisco',
'name:zho': '\xe8\x88\x8a\xe9\x87\x91\xe5\xb1\xb1\xe5\xb8\x82\xe8\x88\x87\xe7\xb8\xa3' })
'name:zh': '\xe8\x88\x8a\xe9\x87\x91\xe5\xb1\xb1\xe5\xb8\x82\xe8\x88\x87\xe7\xb8\xa3' })

# Node: Londonderry/Derry (267762522)
# http://www.openstreetmap.org/node/267762522
assert_has_feature(
16, 31436, 20731, 'places',
{ 'id': 267762522, 'name:eng_GB': 'Londonderry'})
{ 'id': 267762522, 'name:en_GB': 'Londonderry'})

# Node: Jerusalem (29090735)
# http://www.openstreetmap.org/node/29090735
assert_has_feature(
16, 39180, 26661, 'places',
{ 'id': 29090735,
'name:nan': 'I\xc3\xa2-l\xc5\x8d\xcd\x98-sat-l\xc3\xa9ng',
'name:zho': '\xe8\x80\xb6\xe8\xb7\xaf\xe6\x92\x92\xe5\x86\xb7',
'name:yue': '\xe8\x80\xb6\xe8\xb7\xaf\xe6\x92\x92\xe5\x86\xb7',
'name:zh-min-nan': 'I\xc3\xa2-l\xc5\x8d\xcd\x98-sat-l\xc3\xa9ng',
'name:zh': '\xe8\x80\xb6\xe8\xb7\xaf\xe6\x92\x92\xe5\x86\xb7',
'name:zh-yue': '\xe8\x80\xb6\xe8\xb7\xaf\xe6\x92\x92\xe5\x86\xb7',
})
94 changes: 82 additions & 12 deletions test/test_transform.py
@@ -1,4 +1,5 @@
import unittest
from collections import OrderedDict


class BuildingsClassTest(unittest.TestCase):
@@ -54,45 +55,49 @@ class L10nOsmTransformTest(unittest.TestCase):
def _call_fut(self, x):
from vectordatasource.transform import _convert_osm_l10n_name
result = _convert_osm_l10n_name(x)
if result:
result = result.code
return result

def test_osm_convert_2_3(self):
eng = self._call_fut('en')
self.assertEquals(eng, 'eng')
self.assertEquals(eng, 'en')

def test_osm_convert_3(self):
eng = self._call_fut('eng')
self.assertEquals(eng, 'eng')
self.assertEquals(eng, 'en')

def test_osm_convert_not_found(self):
invalid = self._call_fut('foo')
self.assertIsNone(invalid)

def test_osm_convert_country(self):
eng_gb = self._call_fut('en_GB')
self.assertEquals(eng_gb, 'eng_GB')
self.assertEquals(eng_gb, 'en_GB')

def test_osm_convert_country_invalid(self):
not_found = self._call_fut('en_foo')
self.assertIsNone(not_found)
no_country = self._call_fut('en_foo')
self.assertEquals(no_country, 'en')

def test_osm_convert_lookup(self):
zh_min_nan = self._call_fut('zh-min-nan')
self.assertEquals(zh_min_nan, 'nan')
self.assertEquals(zh_min_nan, 'zh-min-nan')
zh_min_nan = self._call_fut('zh-yue')
self.assertEquals(zh_min_nan, 'yue')
self.assertEquals(zh_min_nan, 'zh-yue')


class L10nWofTransformTest(unittest.TestCase):

def _call_fut(self, x):
from vectordatasource.transform import _convert_wof_l10n_name
result = _convert_wof_l10n_name(x)
if result:
result = result.code
return result

def test_osm_convert_valid(self):
eng = self._call_fut('eng_x')
self.assertEquals(eng, 'eng')
self.assertEquals(eng, 'en')

def test_osm_convert_invalid(self):
invalid = self._call_fut('zzz_x')
@@ -118,14 +123,79 @@ def _call_fut(self, source, name_key, name_val):

def test_osm_source(self):
shape, props, fid = self._call_fut('openstreetmap.org', 'en', 'foo')
self.assertTrue('name:eng' in props)
self.assertEquals('foo', props['name:eng'])
self.assertTrue('name:en' in props)
self.assertEquals('foo', props['name:en'])

def test_wof_source(self):
shape, props, fid = self._call_fut('whosonfirst.mapzen.com',
'eng_x', 'foo')
self.assertTrue('name:eng' in props)
self.assertEquals('foo', props['name:eng'])
self.assertTrue('name:en' in props)
self.assertEquals('foo', props['name:en'])


class TagsPriorityI18nTest(unittest.TestCase):

def _call_fut(self, source, kvs):
from vectordatasource.transform import tags_name_i18n
shape = fid = zoom = None

# need to control the order of tags so that we can force the situation
# where one key overwrites another.
tags = OrderedDict()
for k, v in kvs.items():
tags['name:%s' % k] = v

props = dict(
source=source,
tags=tags,
name='unused',
)
result = tags_name_i18n(shape, props, fid, zoom)
return result

def test_wof_no_two_letter_code(self):
# given variants which have no 2-letter code (arq), then we should
# just be left with the ones which do (ara).
shape, props, fid = self._call_fut('whosonfirst.mapzen.com',
{'ara': 'foo', 'arq': 'bar'})
self.assertTrue('name:ar' in props)
self.assertFalse('name:ara' in props)
self.assertFalse('name:arq' in props)
self.assertEquals('foo', props['name:ar'])

def test_osm_invalid_country_code(self):
# given variants with an invalid or unrecognised country code, then
# we should keep any original which had no country code, as it is
# more specific.
langs = OrderedDict([
('en', 'foo'), # The One True Flavour of English.
('en_GB', 'bar'), # Also the correct flavour ;-)
('en_AA', 'baz'), # User-defined country code.
('en_CT', 'bat'), # Currently unassigned/deleted code.
])
shape, props, fid = self._call_fut('openstreetmap.org', langs)

self.assertEquals('foo', props.get('name:en'))
self.assertEquals('bar', props.get('name:en_GB'))
self.assertFalse('name:en_AA' in props)
self.assertFalse('name:en_CT' in props)

def test_osm_invalid_country_code_reverse(self):
# same as the previous test, just checking that when the order of
# the keys is different (we wouldn't normally have control over it
# as it's in a dict), the result is the same.
langs = OrderedDict([
('en_GB', 'bar'),
('en_AA', 'baz'),
('en_CT', 'bat'),
('en', 'foo'),
])
shape, props, fid = self._call_fut('openstreetmap.org', langs)

self.assertEquals('foo', props.get('name:en'))
self.assertEquals('bar', props.get('name:en_GB'))
self.assertFalse('name:en_AA' in props)
self.assertFalse('name:en_CT' in props)


class DropFeaturesMinPixelsTest(unittest.TestCase):
83 changes: 55 additions & 28 deletions vectordatasource/transform.py
@@ -1,6 +1,6 @@
# transformation functions to apply to features

from collections import defaultdict
from collections import defaultdict, namedtuple
from numbers import Number
from shapely.geometry.collection import GeometryCollection
from shapely.geometry import box as Box
@@ -341,15 +341,29 @@ def tags_remove(shape, properties, fid, zoom):
)


def _iso639_1_code_of(lang):
try:
iso639_1_code = lang.iso639_1_code.encode('utf-8')
except AttributeError:
return None
return iso639_1_code


# a structure to return language code lookup results preserving the priority
# (lower is better) of the result for use in situations where multiple inputs
# can map to the same output.
LangResult = namedtuple('LangResult', ['code', 'priority'])


def _convert_wof_l10n_name(x):
lang_str_iso_639_3 = x[:3]
if len(lang_str_iso_639_3) != 3:
return None
try:
pycountry.languages.get(iso639_3_code=lang_str_iso_639_3)
lang = pycountry.languages.get(iso639_3_code=lang_str_iso_639_3)
except KeyError:
return None
return lang_str_iso_639_3
return LangResult(code=_iso639_1_code_of(lang), priority=0)


def _normalize_osm_lang_code(x):
@@ -366,8 +380,7 @@ def _normalize_osm_lang_code(x):
lang = pycountry.languages.get(iso639_3_code=x)
except KeyError:
return None
iso639_3_code = lang.iso639_3_code.encode('utf-8')
return iso639_3_code
return _iso639_1_code_of(lang)


def _normalize_country_code(x):
@@ -386,39 +399,42 @@ def _normalize_country_code(x):
return alpha2_code


osm_l10n_lookup = {
'zh-min-nan': 'nan',
'zh-yue': 'yue',
}


def osm_l10n_name_lookup(x):
lookup = osm_l10n_lookup.get(x)
if lookup is not None:
return lookup
else:
return x
osm_l10n_lookup = set([
'zh-min-nan',
'zh-yue'
])


def _convert_osm_l10n_name(x):
x = osm_l10n_name_lookup(x)
if x in osm_l10n_lookup:
return LangResult(code=x, priority=0)

if '_' not in x:
return _normalize_osm_lang_code(x)
lang_code_candidate = x
country_candidate = None

fields_by_underscore = x.split('_', 1)
lang_code_candidate, country_candidate = fields_by_underscore
else:
fields_by_underscore = x.split('_', 1)
lang_code_candidate, country_candidate = fields_by_underscore

lang_code_result = _normalize_osm_lang_code(lang_code_candidate)
if lang_code_result is None:
return None

country_result = _normalize_country_code(country_candidate)
if country_result is None:
return None
priority = 0
if country_candidate:
country_result = _normalize_country_code(country_candidate)
if country_result is None:
result = lang_code_result
priority = 1

result = '%s_%s' % (lang_code_result, country_result)
return result
else:
result = '%s_%s' % (lang_code_result, country_result)

else:
result = lang_code_result

return LangResult(code=result, priority=priority)


def tags_name_i18n(shape, properties, fid, zoom):
@@ -448,16 +464,27 @@ def tags_name_i18n(shape, properties, fid, zoom):
# become available.
return shape, properties, fid

langs = {}
for k, v in tags.items():
if v == name:
continue
for candidate in alt_name_prefix_candidates:

if k.startswith(candidate):
lang_code = k[len(candidate):]
normalized_lang_code = convert_fn(lang_code)

if normalized_lang_code:
lang_key = '%s%s' % (candidate, normalized_lang_code)
properties[lang_key] = v
code = normalized_lang_code.code
priority = normalized_lang_code.priority
lang_key = '%s%s' % (candidate, code)

if lang_key not in langs or \
priority < langs[lang_key][0].priority:
langs[lang_key] = (normalized_lang_code, v)

for lang_key, (lang, v) in langs.items():
properties[lang_key] = v

for alt_tag_name_candidate in tag_name_alternates:
alt_tag_name_value = tags.get(alt_tag_name_candidate)
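To make the collision handling added to `tags_name_i18n` easier to follow, here is a simplified, self-contained sketch (not the commit's code) of how the `LangResult` priority can be used to choose between two inputs that normalize to the same output key, for example `name:en` from both `en` and `en_XX` with an unrecognised country code.

```python
# Simplified sketch -- not the commit's code -- of priority-based collision
# resolution between tags that normalize to the same output key.
from collections import namedtuple

LangResult = namedtuple('LangResult', ['code', 'priority'])


def pick_names(results):
    # results: list of (LangResult, value) pairs; lower priority wins
    best = {}
    for lang, value in results:
        key = 'name:%s' % lang.code
        if key not in best or lang.priority < best[key][0].priority:
            best[key] = (lang, value)
    return dict((key, value) for key, (lang, value) in best.items())


names = pick_names([
    (LangResult(code='en', priority=1), 'from en_XX'),  # country code dropped
    (LangResult(code='en', priority=0), 'from en'),     # exact 2-letter code
])
assert names == {'name:en': 'from en'}
```

Because the bare `en` result carries priority 0, it wins over the degraded `en_XX` form regardless of tag ordering, which is the behaviour exercised by `TagsPriorityI18nTest` above.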
