From 3059db63997eb7900ae99bf7f3ae25f6ed9d4a46 Mon Sep 17 00:00:00 2001 From: VimT Date: Fri, 4 Oct 2024 23:26:59 +0800 Subject: [PATCH] Fix/14 caching issues using dictionary (#15) fix: caching point issue (#14) --- README.md | 51 +++++++++++++++++++++++++++++++++----------------- mmdb_writer.py | 26 ++++++++++--------------- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index d652ead..5089cb2 100644 --- a/README.md +++ b/README.md @@ -10,14 +10,16 @@ # MaxMind-DB-Writer-python -Make `mmdb` format ip library file which can be read by [`maxmind` official language reader](https://dev.maxmind.com/geoip/geoip2/downloadable/) +Make `mmdb` format ip library file which can be read by [ +`maxmind` official language reader](https://dev.maxmind.com/geoip/geoip2/downloadable/) -~~[The official perl writer](https://github.com/maxmind/MaxMind-DB-Writer-perl) was written in perl, -which was difficult to customize. +~~[The official perl writer](https://github.com/maxmind/MaxMind-DB-Writer-perl) was written in perl, +which was difficult to customize. So I implemented the `MaxmindDB format` ip library in python language.~~ -MaxMind has now released an official Go version of the MMDB writer. -If you prefer using Go, you can check out the official Go implementation [mmdbwriter](https://github.com/maxmind/mmdbwriter). +MaxMind has now released an official Go version of the MMDB writer. +If you prefer using Go, you can check out the official Go +implementation [mmdbwriter](https://github.com/maxmind/mmdbwriter). This project still provides a Python alternative for those who need it. ## Install @@ -27,30 +29,35 @@ pip install -U mmdb_writer ``` ## Usage + ```python from netaddr import IPSet from mmdb_writer import MMDBWriter + writer = MMDBWriter() writer.insert_network(IPSet(['1.1.0.0/24', '1.1.1.0/24']), {'country': 'COUNTRY', 'isp': 'ISP'}) writer.to_db_file('test.mmdb') import maxminddb + m = maxminddb.open_database('test.mmdb') r = m.get('1.1.1.1') assert r == {'country': 'COUNTRY', 'isp': 'ISP'} ``` ## Examples + see [csv_to_mmdb.py](./examples/csv_to_mmdb.py) Here is a professional and clear translation of the README.md section from Chinese into English: ## Using the Java Client -### TLDR +If you are using the Java client, you need to be careful to set the `int_type` parameter so that Java correctly +recognizes the integer type in the MMDB file. -When generating an MMDB file for use with the Java client, you must specify the `int_type`: +Example: ```python from mmdb_writer import MMDBWriter @@ -65,15 +72,15 @@ Alternatively, you can explicitly specify data types using the [Type Enforcement In Java, when deserializing to a structure, the numeric types will use the original MMDB numeric types. The specific conversion relationships are as follows: -| mmdb type | java type | -|--------------|------------| -| float (15) | Float | -| double (3) | Double | -| int32 (8) | Integer | -| uint16 (5) | Integer | -| uint32 (6) | Long | -| uint64 (9) | BigInteger | -| uint128 (10) | BigInteger | +| mmdb type | java type | +|-----------|------------| +| float | Float | +| double | Double | +| int32 | Integer | +| uint16 | Integer | +| uint32 | Long | +| uint64 | BigInteger | +| uint128 | BigInteger | When using the Python writer to generate an MMDB file, by default, it converts integers to the corresponding MMDB type based on the size of the `int`. For instance, `int(1)` would convert to `uint16`, and `int(2**16+1)` would convert @@ -97,7 +104,17 @@ MMDB file. The behaviors for different `int_type` settings are: | u64 | Stores all integer types as `uint64`. | | u128 | Stores all integer types as `uint128`. | +If you want to use different int types for different scenarios, you can use type wrapping: + +```python +from mmdb_writer import MMDBWriter, MmdbI32, MmdbF32 + +writer = MMDBWriter() +# the value of field "i32" will be stored as int32 type +writer.insert_network(IPSet(["1.0.0.0/24"]), {"i32": MmdbI32(128), "f32": MmdbF32(1.22)}) +``` + +## Reference: -## Reference: - [MaxmindDB format](http://maxmind.github.io/MaxMind-DB/) - [geoip-mmdb](https://github.com/i-rinat/geoip-mmdb) diff --git a/mmdb_writer.py b/mmdb_writer.py index e26de1f..7b984a6 100644 --- a/mmdb_writer.py +++ b/mmdb_writer.py @@ -378,11 +378,12 @@ def encode_meta(self, meta): res += self.encode(v, meta_type.get(k)) return res - def encode(self, value, type_id=None): + def encode(self, value, type_id=None, return_offset=False): if self.cache: cache_key = self._freeze(value) try: - return self.data_cache[cache_key] + offset = self.data_cache[cache_key] + return offset if return_offset else self._encode_pointer(offset) except KeyError: pass @@ -399,18 +400,11 @@ def encode(self, value, type_id=None): res = encoder(value) if self.cache: - # add to cache - if type_id == 1: - self.data_list.append(res) - self.data_pointer += len(res) - return res - else: - self.data_list.append(res) - pointer_position = self.data_pointer - self.data_pointer += len(res) - pointer = self.encode(pointer_position, 1) - self.data_cache[cache_key] = pointer - return pointer + self.data_list.append(res) + offset = self.data_pointer + self.data_pointer += len(res) + self.data_cache[cache_key] = offset + return offset if return_offset else self._encode_pointer(offset) return res @@ -484,8 +478,8 @@ def _enumerate_nodes(self, node): elif type(node) is SearchTreeLeaf: node_id = id(node) if node_id not in self._leaf_offset: - res = self.encoder.encode(node.value) - self._leaf_offset[node_id] = self._data_pointer - len(res) + offset = self.encoder.encode(node.value, return_offset=True) + self._leaf_offset[node_id] = offset + 16 else: # == None return