
Merge pull request #975 from tilezen/zerebubuth/972-use-2-letter-language-codes

Use 2 letter language codes
zerebubuth authored Aug 19, 2016
2 parents 90e5c28 + 4df4863 commit 9f72d0e
Showing 5 changed files with 157 additions and 60 deletions.
24 changes: 12 additions & 12 deletions SEMANTIC-VERSIONING.md
@@ -132,18 +132,18 @@ In addition to the `common` **name** locals call a place, the following `common`

#### Common languages:

1. `name:ara` Arabic
1. `name:zho` Chinese, traditional or simplified
1. `name:eng` English
1. `name:fra` French
1. `name:rus` Russian
1. `name:spa` Spanish
1. `name:deu` German
1. `name:gre` Greek
1. `name:ita` Italian
1. `name:jpn` Japanese
1. `name:kor` Korean
1. `name:vie` Vietnamese
1. `name:ar` Arabic
1. `name:zh` Chinese, traditional or simplified
1. `name:en` English
1. `name:fr` French
1. `name:ru` Russian
1. `name:es` Spanish
1. `name:de` German
1. `name:gr` Greek
1. `name:it` Italian
1. `name:jp` Japanese
1. `name:ko` Korean
1. `name:vi` Vietnamese

Arabic, Chinese, English, French, Russian and Spanish are used by the United Nations for meetings and official documents. The other languages listed are frequently used in OpenStreetMap and Who's On First.
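
For illustration only (this sketch is not part of the commit), the 2-letter keys above can be derived from 3-letter ISO 639-3 codes with pycountry, in the spirit of the `transform.py` changes further down. The `iso639_3_code`/`iso639_1_code` names mirror the pycountry API used in this diff; newer pycountry releases spell them `alpha_3`/`alpha_2`.

```python
# Illustrative sketch only -- not part of this commit. Shows how a 3-letter
# ISO 639-3 code such as 'eng' reduces to the 2-letter 'en' used in the list
# above. Attribute and keyword names mirror the pycountry API used in
# transform.py in this diff.
import pycountry


def two_letter_code(iso639_3_code):
    try:
        lang = pycountry.languages.get(iso639_3_code=iso639_3_code)
    except KeyError:
        # unknown 3-letter code
        return None
    # some languages (e.g. 'arq', Algerian Arabic) have no 2-letter code
    return getattr(lang, 'iso639_1_code', None)


print(two_letter_code('eng'))  # expected: 'en'
print(two_letter_code('arq'))  # expected: None
```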

2 changes: 1 addition & 1 deletion docs/layers.md
@@ -16,7 +16,7 @@ Mapzen primarily sources from OpenStreetMap, but includes a variety of other ope

#### Name localization

Mapzen vector tile features include the default `name` property. We include all language variants of the `name:*`, `alt_name:*`, `alt_name_`, `old_name:*` values to enable full internationalization (when different than `name`). Tangram supports all language scripts.
Mapzen vector tile features include the default `name` property. We include all language variants of the `name:*`, `alt_name:*`, `alt_name_`, `old_name:*` values to enable full internationalization (when different than `name`). Tangram supports all language scripts. Language variants are identified by an ISO 639-1 two-letter language code and optional country, for example `en_GB` for British English.

For features in the `boundaries` layer, we support two additional variants `left:name:*` and `right:name:*` to support oriented labeling on the appropriate side of the boundary line (so the labeled polygon's text can appear inside that polygon consistently).
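
As a hypothetical illustration of the scheme described above, a feature's properties might look like the sketch below. The keys follow the documentation; the feature, languages and values are invented purely for this example.

```python
# Hypothetical feature properties under the naming scheme described above.
place_properties = {
    'name': 'Deutschland',
    'name:en': 'Germany',       # ISO 639-1 two-letter language code
    'name:fr': 'Allemagne',
    'name:en_GB': 'Germany',    # optional country suffix, e.g. British English
}

# On a `boundaries` layer feature, the left/right variants allow each side of
# the shared line to be labelled with the neighbouring polygon's name.
boundary_properties = {
    'left:name:en': 'Germany',
    'right:name:en': 'France',
}
```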

14 changes: 7 additions & 7 deletions integration-test/418-wof-l10n_name.py
@@ -5,7 +5,7 @@
{ 'id': 85826037, 'kind': 'neighbourhood',
'source': "whosonfirst.mapzen.com",
'name': 'Hollywood',
'name:kor': '\xed\x97\x90\xeb\xa6\xac\xec\x9a\xb0\xeb\x93\x9c' })
'name:ko': '\xed\x97\x90\xeb\xa6\xac\xec\x9a\xb0\xeb\x93\x9c' })

# San Francisco (wof neighbourhood)
# https://whosonfirst.mapzen.com/data/858/826/41/85882641.geojson
@@ -14,7 +14,7 @@
{ 'id': 85882641, 'kind': 'neighbourhood',
'source': "whosonfirst.mapzen.com",
'name': 'San Francisco',
'name:spa': type(None) })
'name:es': type(None) })

# San Francisco (osm city)
# http://www.openstreetmap.org/node/26819236
@@ -23,20 +23,20 @@
{ 'id': 26819236, 'kind': 'locality', 'kind_detail': 'city',
'source': "openstreetmap.org",
'name': 'San Francisco',
'name:zho': '\xe8\x88\x8a\xe9\x87\x91\xe5\xb1\xb1\xe5\xb8\x82\xe8\x88\x87\xe7\xb8\xa3' })
'name:zh': '\xe8\x88\x8a\xe9\x87\x91\xe5\xb1\xb1\xe5\xb8\x82\xe8\x88\x87\xe7\xb8\xa3' })

# Node: Londonderry/Derry (267762522)
# http://www.openstreetmap.org/node/267762522
assert_has_feature(
16, 31436, 20731, 'places',
{ 'id': 267762522, 'name:eng_GB': 'Londonderry'})
{ 'id': 267762522, 'name:en_GB': 'Londonderry'})

# Node: Jerusalem (29090735)
# http://www.openstreetmap.org/node/29090735
assert_has_feature(
16, 39180, 26661, 'places',
{ 'id': 29090735,
'name:nan': 'I\xc3\xa2-l\xc5\x8d\xcd\x98-sat-l\xc3\xa9ng',
'name:zho': '\xe8\x80\xb6\xe8\xb7\xaf\xe6\x92\x92\xe5\x86\xb7',
'name:yue': '\xe8\x80\xb6\xe8\xb7\xaf\xe6\x92\x92\xe5\x86\xb7',
'name:zh-min-nan': 'I\xc3\xa2-l\xc5\x8d\xcd\x98-sat-l\xc3\xa9ng',
'name:zh': '\xe8\x80\xb6\xe8\xb7\xaf\xe6\x92\x92\xe5\x86\xb7',
'name:zh-yue': '\xe8\x80\xb6\xe8\xb7\xaf\xe6\x92\x92\xe5\x86\xb7',
})
94 changes: 82 additions & 12 deletions test/test_transform.py
@@ -1,4 +1,5 @@
import unittest
from collections import OrderedDict


class BuildingsClassTest(unittest.TestCase):
@@ -54,45 +55,49 @@ class L10nOsmTransformTest(unittest.TestCase):
def _call_fut(self, x):
from vectordatasource.transform import _convert_osm_l10n_name
result = _convert_osm_l10n_name(x)
if result:
result = result.code
return result

def test_osm_convert_2_3(self):
eng = self._call_fut('en')
self.assertEquals(eng, 'eng')
self.assertEquals(eng, 'en')

def test_osm_convert_3(self):
eng = self._call_fut('eng')
self.assertEquals(eng, 'eng')
self.assertEquals(eng, 'en')

def test_osm_convert_not_found(self):
invalid = self._call_fut('foo')
self.assertIsNone(invalid)

def test_osm_convert_country(self):
eng_gb = self._call_fut('en_GB')
self.assertEquals(eng_gb, 'eng_GB')
self.assertEquals(eng_gb, 'en_GB')

def test_osm_convert_country_invalid(self):
not_found = self._call_fut('en_foo')
self.assertIsNone(not_found)
no_country = self._call_fut('en_foo')
self.assertEquals(no_country, 'en')

def test_osm_convert_lookup(self):
zh_min_nan = self._call_fut('zh-min-nan')
self.assertEquals(zh_min_nan, 'nan')
self.assertEquals(zh_min_nan, 'zh-min-nan')
zh_min_nan = self._call_fut('zh-yue')
self.assertEquals(zh_min_nan, 'yue')
self.assertEquals(zh_min_nan, 'zh-yue')


class L10nWofTransformTest(unittest.TestCase):

def _call_fut(self, x):
from vectordatasource.transform import _convert_wof_l10n_name
result = _convert_wof_l10n_name(x)
if result:
result = result.code
return result

def test_osm_convert_valid(self):
eng = self._call_fut('eng_x')
self.assertEquals(eng, 'eng')
self.assertEquals(eng, 'en')

def test_osm_convert_invalid(self):
invalid = self._call_fut('zzz_x')
@@ -118,14 +123,79 @@ def _call_fut(self, source, name_key, name_val):

def test_osm_source(self):
shape, props, fid = self._call_fut('openstreetmap.org', 'en', 'foo')
self.assertTrue('name:eng' in props)
self.assertEquals('foo', props['name:eng'])
self.assertTrue('name:en' in props)
self.assertEquals('foo', props['name:en'])

def test_wof_source(self):
shape, props, fid = self._call_fut('whosonfirst.mapzen.com',
'eng_x', 'foo')
self.assertTrue('name:eng' in props)
self.assertEquals('foo', props['name:eng'])
self.assertTrue('name:en' in props)
self.assertEquals('foo', props['name:en'])


class TagsPriorityI18nTest(unittest.TestCase):

def _call_fut(self, source, kvs):
from vectordatasource.transform import tags_name_i18n
shape = fid = zoom = None

# need to control the order of tags so that we can force the situation
# where one key overwrites another.
tags = OrderedDict()
for k, v in kvs.items():
tags['name:%s' % k] = v

props = dict(
source=source,
tags=tags,
name='unused',
)
result = tags_name_i18n(shape, props, fid, zoom)
return result

def test_wof_no_two_letter_code(self):
# given variants which have no 2-letter code (arq), then we should
# just be left with the ones which do (ara).
shape, props, fid = self._call_fut('whosonfirst.mapzen.com',
{'ara': 'foo', 'arq': 'bar'})
self.assertTrue('name:ar' in props)
self.assertFalse('name:ara' in props)
self.assertFalse('name:arq' in props)
self.assertEquals('foo', props['name:ar'])

def test_osm_invalid_country_code(self):
# given variants with an invalid or unrecognised country code, then
# we should keep any original which had no country code, as it is
# more specific.
langs = OrderedDict([
('en', 'foo'), # The One True Flavour of English.
('en_GB', 'bar'), # Also the correct flavour ;-)
('en_AA', 'baz'), # User-defined country code.
('en_CT', 'bat'), # Currently unassigned/deleted code.
])
shape, props, fid = self._call_fut('openstreetmap.org', langs)

self.assertEquals('foo', props.get('name:en'))
self.assertEquals('bar', props.get('name:en_GB'))
self.assertFalse('name:en_AA' in props)
self.assertFalse('name:en_CT' in props)

def test_osm_invalid_country_code_reverse(self):
# same as the previous test, just checking that when the order of
# the keys is different (we wouldn't normally have control over it
# as it's in a dict), the result is the same.
langs = OrderedDict([
('en_GB', 'bar'),
('en_AA', 'baz'),
('en_CT', 'bat'),
('en', 'foo'),
])
shape, props, fid = self._call_fut('openstreetmap.org', langs)

self.assertEquals('foo', props.get('name:en'))
self.assertEquals('bar', props.get('name:en_GB'))
self.assertFalse('name:en_AA' in props)
self.assertFalse('name:en_CT' in props)


class DropFeaturesMinPixelsTest(unittest.TestCase):
83 changes: 55 additions & 28 deletions vectordatasource/transform.py
@@ -1,6 +1,6 @@
# transformation functions to apply to features

from collections import defaultdict
from collections import defaultdict, namedtuple
from numbers import Number
from shapely.geometry.collection import GeometryCollection
from shapely.geometry import box as Box
@@ -341,15 +341,29 @@ def tags_remove(shape, properties, fid, zoom):
)


def _iso639_1_code_of(lang):
try:
iso639_1_code = lang.iso639_1_code.encode('utf-8')
except AttributeError:
return None
return iso639_1_code


# a structure to return language code lookup results preserving the priority
# (lower is better) of the result for use in situations where multiple inputs
# can map to the same output.
LangResult = namedtuple('LangResult', ['code', 'priority'])


def _convert_wof_l10n_name(x):
lang_str_iso_639_3 = x[:3]
if len(lang_str_iso_639_3) != 3:
return None
try:
pycountry.languages.get(iso639_3_code=lang_str_iso_639_3)
lang = pycountry.languages.get(iso639_3_code=lang_str_iso_639_3)
except KeyError:
return None
return lang_str_iso_639_3
return LangResult(code=_iso639_1_code_of(lang), priority=0)


def _normalize_osm_lang_code(x):
@@ -366,8 +380,7 @@ def _normalize_osm_lang_code(x):
lang = pycountry.languages.get(iso639_3_code=x)
except KeyError:
return None
iso639_3_code = lang.iso639_3_code.encode('utf-8')
return iso639_3_code
return _iso639_1_code_of(lang)


def _normalize_country_code(x):
@@ -386,39 +399,42 @@ def _normalize_country_code(x):
return alpha2_code


osm_l10n_lookup = {
'zh-min-nan': 'nan',
'zh-yue': 'yue',
}


def osm_l10n_name_lookup(x):
lookup = osm_l10n_lookup.get(x)
if lookup is not None:
return lookup
else:
return x
osm_l10n_lookup = set([
'zh-min-nan',
'zh-yue'
])


def _convert_osm_l10n_name(x):
x = osm_l10n_name_lookup(x)
if x in osm_l10n_lookup:
return LangResult(code=x, priority=0)

if '_' not in x:
return _normalize_osm_lang_code(x)
lang_code_candidate = x
country_candidate = None

fields_by_underscore = x.split('_', 1)
lang_code_candidate, country_candidate = fields_by_underscore
else:
fields_by_underscore = x.split('_', 1)
lang_code_candidate, country_candidate = fields_by_underscore

lang_code_result = _normalize_osm_lang_code(lang_code_candidate)
if lang_code_result is None:
return None

country_result = _normalize_country_code(country_candidate)
if country_result is None:
return None
priority = 0
if country_candidate:
country_result = _normalize_country_code(country_candidate)
if country_result is None:
result = lang_code_result
priority = 1

result = '%s_%s' % (lang_code_result, country_result)
return result
else:
result = '%s_%s' % (lang_code_result, country_result)

else:
result = lang_code_result

return LangResult(code=result, priority=priority)


def tags_name_i18n(shape, properties, fid, zoom):
@@ -448,16 +464,27 @@ def tags_name_i18n(shape, properties, fid, zoom):
# become available.
return shape, properties, fid

langs = {}
for k, v in tags.items():
if v == name:
continue
for candidate in alt_name_prefix_candidates:

if k.startswith(candidate):
lang_code = k[len(candidate):]
normalized_lang_code = convert_fn(lang_code)

if normalized_lang_code:
lang_key = '%s%s' % (candidate, normalized_lang_code)
properties[lang_key] = v
code = normalized_lang_code.code
priority = normalized_lang_code.priority
lang_key = '%s%s' % (candidate, code)

if lang_key not in langs or \
priority < langs[lang_key][0].priority:
langs[lang_key] = (normalized_lang_code, v)

for lang_key, (lang, v) in langs.items():
properties[lang_key] = v

for alt_tag_name_candidate in tag_name_alternates:
alt_tag_name_value = tags.get(alt_tag_name_candidate)
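To make the collision handling added to `tags_name_i18n` easier to follow, here is a simplified, self-contained sketch (not the commit's code) of how the `LangResult` priority can be used to choose between two inputs that normalize to the same output key, for example `name:en` from both `en` and `en_XX` with an unrecognised country code.

```python
# Simplified sketch -- not the commit's code -- of priority-based collision
# resolution between tags that normalize to the same output key.
from collections import namedtuple

LangResult = namedtuple('LangResult', ['code', 'priority'])


def pick_names(results):
    # results: list of (LangResult, value) pairs; lower priority wins
    best = {}
    for lang, value in results:
        key = 'name:%s' % lang.code
        if key not in best or lang.priority < best[key][0].priority:
            best[key] = (lang, value)
    return dict((key, value) for key, (lang, value) in best.items())


names = pick_names([
    (LangResult(code='en', priority=1), 'from en_XX'),  # country code dropped
    (LangResult(code='en', priority=0), 'from en'),     # exact 2-letter code
])
assert names == {'name:en': 'from en'}
```

Because the bare `en` result carries priority 0, it wins over the degraded `en_XX` form regardless of tag ordering, which is the behaviour exercised by `TagsPriorityI18nTest` above.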
