Skip to content

Commit

Permalink
Merge pull request #314 from mabel-dev/v0.2.0
Browse files Browse the repository at this point in the history
V0.2.0
  • Loading branch information
joocer authored Jul 30, 2022
2 parents 02da1f2 + 0202b06 commit 0061e52
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 22 deletions.
1 change: 1 addition & 0 deletions opteryx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pathlib import Path

__author__: str = "@joocer"

apilevel = "1.0" # pylint: disable=C0103
threadsafety = 0 # pylint: disable=C0103
Expand Down
2 changes: 1 addition & 1 deletion opteryx/engine/planner/operations/blob_reader_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from opteryx.engine.planner.operations import BasePlanNode
from opteryx.exceptions import DatabaseError
from opteryx.storage import file_decoders
from opteryx.storage.adapters import DiskStorage
from opteryx.storage.schemes import MabelPartitionScheme
from opteryx.storage.schemes import DefaultPartitionScheme
from opteryx.utils.columns import Columns
Expand Down Expand Up @@ -93,6 +92,7 @@ def _normalize_to_schema(table, schema):
my_schema = table.schema.set(
index, pyarrow.field(column, first_types[column])
)

table = table.cast(target_schema=my_schema)

return table, schema
Expand Down
44 changes: 24 additions & 20 deletions opteryx/engine/planner/operations/show_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
Gives information about a dataset's columns
"""
import datetime

from functools import reduce
from typing import Iterable
from numpy import nan, nanmin, nanmax
Expand All @@ -32,19 +34,29 @@
from opteryx.utils.columns import Columns

MAX_COLLECTOR: int = 17
MAX_VARCHAR_SIZE: int = 64 # long strings tend to lose meaning
MAX_DATA_SIZE: int = 100 * 1024 * 1024


def _to_linux_epoch(date):
    """
    Convert a scalar wrapper into seconds since the epoch.

    Parameters:
        date: an object exposing `as_py()`; the returned Python value is
            either None (missing) or something with an `isoformat()` method.

    Returns:
        `numpy.nan` when the wrapped value is None, otherwise the float
        produced by `datetime.datetime.timestamp()`.
    """
    # convert once; this runs per value, so avoid calling as_py() twice
    value = date.as_py()
    if value is None:
        return numpy.nan
    # Going via the ISO text always yields a datetime.datetime, whatever
    # isoformat()-capable object as_py() handed back.
    # NOTE(review): timestamp() treats naive datetimes as local time - confirm
    # that is the intended interpretation for stored values.
    return datetime.datetime.fromisoformat(value.isoformat()).timestamp()


def myhash(anything):
    """
    Produce an integer hash for a value, including lists and dicts.

    Parameters:
        anything: the value to hash.

    Returns:
        int:
        - list: the XOR of the hashes of its members, starting from 0. An
          empty list hashes to 0, member order does not affect the result
          and a pair of equal members cancels out.
        - bool: 0 or 1.
        - dict: CityHash64 of the "key:value" pairs joined in sorted key
          order.
        - anything else: CityHash64 of `str(anything)`.
    """
    if isinstance(anything, list):
        hashed = map(myhash, anything)
        return reduce(lambda x, y: x ^ y, hashed, 0)
    if isinstance(anything, bool):
        return int(anything)

    # imported lazily; only the branches below need the cityhash package
    from cityhash import CityHash64

    if isinstance(anything, dict):
        return CityHash64(
            "".join([f"{k}:{anything[k]}" for k in sorted(anything.keys())])
        )
    return CityHash64(str(anything))


def _simple_collector(page):
Expand Down Expand Up @@ -214,7 +226,7 @@ def _extended_collector(pages):
continue

# to prevent problems, we set some limits
if column_data.nbytes > 100 * 1024 * 1024:
if column_data.nbytes > MAX_DATA_SIZE:
if column not in uncollected_columns:
uncollected_columns.append(column)
continue
Expand All @@ -234,23 +246,14 @@ def _extended_collector(pages):
(v.as_py() for v in column_data if v.is_valid),
0,
)
if max_len > 32:
if max_len > MAX_VARCHAR_SIZE:
if column not in uncollected_columns:
uncollected_columns.append(column)
continue

# convert TIMESTAMP into a NUMERIC (seconds since the Unix epoch)
if _type == OPTERYX_TYPES.TIMESTAMP:
import datetime

to_linux_epoch = (
lambda x: numpy.nan
if x.as_py() is None
else datetime.datetime.fromisoformat(
x.as_py().isoformat()
).timestamp()
)
column_data = (to_linux_epoch(i) for i in column_data)
column_data = (_to_linux_epoch(i) for i in column_data)
else:
column_data = (i.as_py() for i in column_data)

Expand Down Expand Up @@ -310,6 +313,7 @@ def _extended_collector(pages):
profile["type"] = ", ".join(profile["type"])

if column not in uncollected_columns:

dgram = profile.pop("distogram", None)
if dgram:
profile["min"], profile["max"] = distogram.bounds(dgram)
Expand Down
2 changes: 1 addition & 1 deletion opteryx/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
2) we can import it in setup.py for the same reason
"""

__version__ = "0.2.0-beta.7"
__version__ = "0.2.0"

0 comments on commit 0061e52

Please sign in to comment.