Skip to content

Commit

Permalink
Keep language code even if country code can't be normalised. Add a pr…
Browse files Browse the repository at this point in the history
…iority to the result, so that we can keep the most specific match.
  • Loading branch information
Matt Amos committed Aug 19, 2016
1 parent 681429b commit 4df4863
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 15 deletions.
74 changes: 72 additions & 2 deletions test/test_transform.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
from collections import OrderedDict


class BuildingsClassTest(unittest.TestCase):
Expand Down Expand Up @@ -54,6 +55,8 @@ class L10nOsmTransformTest(unittest.TestCase):
def _call_fut(self, x):
    """
    Call _convert_osm_l10n_name on `x` and return only the language code
    of its result, or None when the conversion returned None.
    """
    from vectordatasource.transform import _convert_osm_l10n_name
    result = _convert_osm_l10n_name(x)
    # the function under test returns an object with a `code` attribute
    # (or None); the assertions in this class only compare the code.
    if result is not None:
        result = result.code
    return result

def test_osm_convert_2_3(self):
Expand All @@ -73,8 +76,8 @@ def test_osm_convert_country(self):
self.assertEquals(eng_gb, 'en_GB')

def test_osm_convert_country_invalid(self):
not_found = self._call_fut('en_foo')
self.assertIsNone(not_found)
no_country = self._call_fut('en_foo')
self.assertEquals(no_country, 'en')

def test_osm_convert_lookup(self):
zh_min_nan = self._call_fut('zh-min-nan')
Expand All @@ -88,6 +91,8 @@ class L10nWofTransformTest(unittest.TestCase):
def _call_fut(self, x):
    """
    Call _convert_wof_l10n_name on `x` and return only the language code
    of its result, or None when the conversion returned None.
    """
    from vectordatasource.transform import _convert_wof_l10n_name
    result = _convert_wof_l10n_name(x)
    # the function under test returns an object with a `code` attribute
    # (or None); the assertions in this class only compare the code.
    if result is not None:
        result = result.code
    return result

def test_osm_convert_valid(self):
Expand Down Expand Up @@ -128,6 +133,71 @@ def test_wof_source(self):
self.assertEquals('foo', props['name:en'])


class TagsPriorityI18nTest(unittest.TestCase):
    """
    Tests for tags_name_i18n when several input language tags can
    normalise to the same output property key.
    """

    def _call_fut(self, source, kvs):
        """
        Run tags_name_i18n on properties built from `kvs`.

        `source` is stored as the `source` property. `kvs` maps language
        codes to names; each entry becomes a `name:<code>` tag, inserted in
        the iteration order of `kvs`. Returns whatever tags_name_i18n
        returns, which the tests unpack as (shape, props, fid).
        """
        from vectordatasource.transform import tags_name_i18n
        shape = fid = zoom = None

        # need to control the order of tags so that we can force the
        # situation where one key overwrites another.
        tags = OrderedDict()
        for k, v in kvs.items():
            tags['name:%s' % k] = v

        props = dict(
            source=source,
            tags=tags,
            name='unused',
        )
        return tags_name_i18n(shape, props, fid, zoom)

    def test_wof_no_two_letter_code(self):
        # given variants which have no 2-letter code (arq), then we should
        # just be left with the ones which do (ara).
        langs = OrderedDict([
            ('ara', 'foo'),
            ('arq', 'bar'),
        ])
        shape, props, fid = self._call_fut('whosonfirst.mapzen.com', langs)

        self.assertIn('name:ar', props)
        self.assertNotIn('name:ara', props)
        self.assertNotIn('name:arq', props)
        self.assertEqual('foo', props['name:ar'])

    def test_osm_invalid_country_code(self):
        # given variants with an invalid or unrecognised country code, then
        # we should keep any original which had no country code, as it is
        # an exact match for the language rather than a fallback.
        langs = OrderedDict([
            ('en', 'foo'),     # The One True Flavour of English.
            ('en_GB', 'bar'),  # Also the correct flavour ;-)
            ('en_AA', 'baz'),  # User-defined country code.
            ('en_CT', 'bat'),  # Currently unassigned/deleted code.
        ])
        shape, props, fid = self._call_fut('openstreetmap.org', langs)

        self.assertEqual('foo', props.get('name:en'))
        self.assertEqual('bar', props.get('name:en_GB'))
        self.assertNotIn('name:en_AA', props)
        self.assertNotIn('name:en_CT', props)

    def test_osm_invalid_country_code_reverse(self):
        # same as the previous test, just checking that when the order of
        # the keys is different (we wouldn't normally have control over it
        # as it's in a dict), the result is the same.
        langs = OrderedDict([
            ('en_GB', 'bar'),
            ('en_AA', 'baz'),
            ('en_CT', 'bat'),
            ('en', 'foo'),
        ])
        shape, props, fid = self._call_fut('openstreetmap.org', langs)

        self.assertEqual('foo', props.get('name:en'))
        self.assertEqual('bar', props.get('name:en_GB'))
        self.assertNotIn('name:en_AA', props)
        self.assertNotIn('name:en_CT', props)


class DropFeaturesMinPixelsTest(unittest.TestCase):

def _make_feature_layers(self, pixel_threshold, shape):
Expand Down
50 changes: 37 additions & 13 deletions vectordatasource/transform.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# transformation functions to apply to features

from collections import defaultdict
from collections import defaultdict, namedtuple
from numbers import Number
from shapely.geometry.collection import GeometryCollection
from shapely.geometry import box as Box
Expand Down Expand Up @@ -341,6 +341,20 @@ def tags_remove(shape, properties, fid, zoom):
)


def _iso639_1_code_of(lang):
try:
iso639_1_code = lang.iso639_1_code.encode('utf-8')
except AttributeError:
return None
return iso639_1_code


# the result of a language code lookup: `code` is the resulting language
# code and `priority` ranks the match (lower is better), so that when
# several inputs map to the same output the best one can be kept.
LangResult = namedtuple('LangResult', 'code priority')


def _convert_wof_l10n_name(x):
lang_str_iso_639_3 = x[:3]
if len(lang_str_iso_639_3) != 3:
Expand All @@ -349,7 +363,7 @@ def _convert_wof_l10n_name(x):
lang = pycountry.languages.get(iso639_3_code=lang_str_iso_639_3)
except KeyError:
return None
return lang.iso639_1_code
return LangResult(code=_iso639_1_code_of(lang), priority=0)


def _normalize_osm_lang_code(x):
Expand All @@ -366,11 +380,7 @@ def _normalize_osm_lang_code(x):
lang = pycountry.languages.get(iso639_3_code=x)
except KeyError:
return None
try:
iso639_1_code = lang.iso639_1_code.encode('utf-8')
except AttributeError:
return None
return iso639_1_code
return _iso639_1_code_of(lang)


def _normalize_country_code(x):
Expand All @@ -397,7 +407,7 @@ def _normalize_country_code(x):

def _convert_osm_l10n_name(x):
if x in osm_l10n_lookup:
return x
return LangResult(code=x, priority=0)

if '_' not in x:
lang_code_candidate = x
Expand All @@ -411,17 +421,20 @@ def _convert_osm_l10n_name(x):
if lang_code_result is None:
return None

priority = 0
if country_candidate:
country_result = _normalize_country_code(country_candidate)
if country_result is None:
return None
result = lang_code_result
priority = 1

result = '%s_%s' % (lang_code_result, country_result)
else:
result = '%s_%s' % (lang_code_result, country_result)

else:
result = lang_code_result

return result
return LangResult(code=result, priority=priority)


def tags_name_i18n(shape, properties, fid, zoom):
Expand Down Expand Up @@ -451,16 +464,27 @@ def tags_name_i18n(shape, properties, fid, zoom):
# become available.
return shape, properties, fid

langs = {}
for k, v in tags.items():
if v == name:
continue
for candidate in alt_name_prefix_candidates:

if k.startswith(candidate):
lang_code = k[len(candidate):]
normalized_lang_code = convert_fn(lang_code)

if normalized_lang_code:
lang_key = '%s%s' % (candidate, normalized_lang_code)
properties[lang_key] = v
code = normalized_lang_code.code
priority = normalized_lang_code.priority
lang_key = '%s%s' % (candidate, code)

if lang_key not in langs or \
priority < langs[lang_key][0].priority:
langs[lang_key] = (normalized_lang_code, v)

for lang_key, (lang, v) in langs.items():
properties[lang_key] = v

for alt_tag_name_candidate in tag_name_alternates:
alt_tag_name_value = tags.get(alt_tag_name_candidate)
Expand Down

0 comments on commit 4df4863

Please sign in to comment.