Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(hogql): properties between shards #14549

Merged
merged 18 commits into from
Mar 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 37 additions & 7 deletions posthog/api/test/__snapshots__/test_query.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,38 @@
/* user_id:0 request:_snapshot_ */
SELECT event,
events.distinct_id,
replaceRegexpAll(JSONExtractRaw(events.properties, 'key'), '^"|"$', ''),
replaceRegexpAll(JSONExtractRaw(properties, 'key'), '^"|"$', ''),
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously events. was added before properties to disambiguate it from the events__pdi__person.properties column that we selected in the join. That's no longer the case and thus it seems like it was automatically removed.

'a%sd',
concat(event, ' ', replaceRegexpAll(JSONExtractRaw(properties, 'key'), '^"|"$', ''))
FROM events
INNER JOIN
(SELECT argMax(person_distinct_id2.person_id, version) AS person_id,
distinct_id
FROM person_distinct_id2
WHERE equals(team_id, 2)
GROUP BY distinct_id
HAVING equals(argMax(is_deleted, version), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id)
INNER JOIN
(SELECT argMax(replaceRegexpAll(JSONExtractRaw(properties, 'email'), '^"|"$', ''), version) AS properties___email,
id
FROM person
WHERE equals(team_id, 2)
GROUP BY id
HAVING equals(argMax(is_deleted, version), 0)) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE and(equals(team_id, 2), equals(events__pdi__person.properties___email, 'tom@posthog.com'), less(timestamp, '2020-01-10 12:14:05.000000'), greater(timestamp, '2020-01-09 12:00:00.000000'))
ORDER BY event ASC
LIMIT 101
OFFSET 0
'
---
# name: TestQuery.test_person_property_filter_materialized
'
/* user_id:0 request:_snapshot_ */
SELECT event,
events.distinct_id,
mat_key,
'a%sd',
concat(event, ' ', replaceRegexpAll(JSONExtractRaw(events.properties, 'key'), '^"|"$', ''))
concat(event, ' ', mat_key)
FROM events
INNER JOIN
(SELECT argMax(person_distinct_id2.person_id, version) AS person_id,
Expand All @@ -249,13 +278,13 @@
GROUP BY distinct_id
HAVING equals(argMax(is_deleted, version), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id)
INNER JOIN
(SELECT argMax(person.properties, version) AS properties,
(SELECT argMax(pmat_email, version) AS properties___email,
id
FROM person
WHERE equals(team_id, 2)
GROUP BY id
HAVING equals(argMax(is_deleted, version), 0)) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE and(equals(team_id, 2), equals(replaceRegexpAll(JSONExtractRaw(events__pdi__person.properties, 'email'), '^"|"$', ''), 'tom@posthog.com'), less(timestamp, '2020-01-10 12:14:05.000000'), greater(timestamp, '2020-01-09 12:00:00.000000'))
WHERE and(equals(team_id, 2), equals(events__pdi__person.properties___email, 'tom@posthog.com'), less(timestamp, '2020-01-10 12:14:05.000000'), greater(timestamp, '2020-01-09 12:00:00.000000'))
ORDER BY event ASC
LIMIT 101
OFFSET 0
Expand Down Expand Up @@ -332,7 +361,7 @@
# name: TestQuery.test_select_hogql_expressions.1
'
/* user_id:0 request:_snapshot_ */
SELECT tuple(uuid, event, events.properties, timestamp, team_id, events.distinct_id, elements_chain, events.created_at, events__pdi.person_id, events__pdi__person.created_at, events__pdi__person.properties),
SELECT tuple(uuid, event, properties, timestamp, team_id, events.distinct_id, elements_chain, events.created_at, events__pdi.person_id, events__pdi__person.created_at, events__pdi__person.properties___name, events__pdi__person.properties___email),
event
FROM events
INNER JOIN
Expand All @@ -344,14 +373,15 @@
HAVING equals(argMax(is_deleted, version), 0)) AS events__pdi ON equals(events.distinct_id, events__pdi.distinct_id)
INNER JOIN
(SELECT argMax(person.created_at, version) AS created_at,
argMax(person.properties, version) AS properties,
argMax(replaceRegexpAll(JSONExtractRaw(properties, 'name'), '^"|"$', ''), version) AS properties___name,
argMax(replaceRegexpAll(JSONExtractRaw(properties, 'email'), '^"|"$', ''), version) AS properties___email,
id
FROM person
WHERE equals(team_id, 2)
GROUP BY id
HAVING equals(argMax(is_deleted, version), 0)) AS events__pdi__person ON equals(events__pdi.person_id, events__pdi__person.id)
WHERE and(equals(team_id, 2), less(timestamp, '2020-01-10 12:14:05.000000'), greater(timestamp, '2020-01-09 12:00:00.000000'))
ORDER BY tuple(uuid, event, events.properties, timestamp, team_id, events.distinct_id, elements_chain, events.created_at, events__pdi.person_id, events__pdi__person.created_at, events__pdi__person.properties) ASC
ORDER BY tuple(uuid, event, properties, timestamp, team_id, events.distinct_id, elements_chain, events.created_at, events__pdi.person_id, events__pdi__person.created_at, events__pdi__person.properties___name, events__pdi__person.properties___email) ASC
LIMIT 101
OFFSET 0
'
Expand Down
3 changes: 1 addition & 2 deletions posthog/api/test/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,7 @@ def test_event_property_filter(self):
response = self.client.post(f"/api/projects/{self.team.id}/query/", query.dict()).json()
self.assertEqual(len(response["results"]), 1)

# TODO: events query person property filters don't use materialized columns!
# @also_test_with_materialized_columns(event_properties=["key"], person_properties=["email"])
@also_test_with_materialized_columns(event_properties=["key"], person_properties=["email"])
@snapshot_clickhouse_queries
def test_person_property_filter(self):
with freeze_time("2020-01-10 12:00:00"):
Expand Down
4 changes: 4 additions & 0 deletions posthog/hogql/ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ class PropertyRef(Ref):
name: str
parent: FieldRef

# The property has been moved into a field we query from a joined subquery
joined_subquery: Optional[SelectQueryAliasRef]
joined_subquery_field_name: Optional[str]

def get_child(self, name: str) -> "Ref":
raise NotImplementedError("JSON property traversal is not yet supported")

Expand Down
15 changes: 0 additions & 15 deletions posthog/hogql/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,21 +102,6 @@
# Keywords you can't alias to
RESERVED_KEYWORDS = KEYWORDS + ["team_id"]

# Allow-listed fields returned when you select "*" from events. Person and group fields will be nested later.
SELECT_STAR_FROM_EVENTS_FIELDS = [
"uuid",
"event",
"properties",
"timestamp",
"team_id",
"distinct_id",
"elements_chain",
"created_at",
"person_id",
"person.created_at",
"person.properties",
]

# Never return more rows than this in top level HogQL SELECT statements
DEFAULT_RETURNED_ROWS = 100
MAX_SELECT_RETURNED_ROWS = 65535
34 changes: 26 additions & 8 deletions posthog/hogql/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def get_field(self, name: str) -> DatabaseField:
def clickhouse_table(self):
raise NotImplementedError("Table.clickhouse_table not overridden")

def hogql_table(self):
raise NotImplementedError("Table.hogql_table not overridden")

def avoid_asterisk_fields(self) -> List[str]:
return []

Expand Down Expand Up @@ -76,7 +79,7 @@ class LazyTable(BaseModel):
class Config:
extra = Extra.forbid

join_function: Callable[[str, str, List[str]], Any]
join_function: Callable[[str, str, Dict[str, Any]], Any]
table: Table
from_field: str

Expand All @@ -101,6 +104,9 @@ class EventsPersonSubTable(VirtualTable):
def clickhouse_table(self):
return "events"

def hogql_table(self):
return "events"


class PersonsTable(Table):
id: StringDatabaseField = StringDatabaseField(name="id")
Expand All @@ -114,8 +120,11 @@ class PersonsTable(Table):
def clickhouse_table(self):
return "person"

def hogql_table(self):
return "persons"

def join_with_persons_table(from_table: str, to_table: str, requested_fields: List[str]):

def join_with_persons_table(from_table: str, to_table: str, requested_fields: Dict[str, Any]):
from posthog.hogql import ast

if not requested_fields:
Expand All @@ -127,9 +136,9 @@ def join_with_persons_table(from_table: str, to_table: str, requested_fields: Li
argmax_version: Callable[[ast.Expr], ast.Expr] = lambda field: ast.Call(
name="argMax", args=[field, ast.Field(chain=["version"])]
)
for field in requested_fields:
for field, expr in requested_fields.items():
if field != "id":
fields_to_select.append(ast.Alias(alias=field, expr=argmax_version(ast.Field(chain=[field]))))
fields_to_select.append(ast.Alias(alias=field, expr=argmax_version(expr)))

id = ast.Field(chain=["id"])

Expand Down Expand Up @@ -169,22 +178,25 @@ def avoid_asterisk_fields(self):
def clickhouse_table(self):
return "person_distinct_id2"

def hogql_table(self):
return "person_distinct_ids"


def join_with_max_person_distinct_id_table(from_table: str, to_table: str, requested_fields: List[str]):
def join_with_max_person_distinct_id_table(from_table: str, to_table: str, requested_fields: Dict[str, Any]):
from posthog.hogql import ast

if not requested_fields:
requested_fields = ["person_id"]
requested_fields = {"person_id": ast.Field(chain=["person_id"])}

# contains the list of fields we will select from this table
fields_to_select: List[ast.Expr] = []

argmax_version: Callable[[ast.Expr], ast.Expr] = lambda field: ast.Call(
name="argMax", args=[field, ast.Field(chain=["version"])]
)
for field in requested_fields:
for field, expr in requested_fields.items():
if field != "distinct_id":
fields_to_select.append(ast.Alias(alias=field, expr=argmax_version(ast.Field(chain=[field]))))
fields_to_select.append(ast.Alias(alias=field, expr=argmax_version(expr)))

distinct_id = ast.Field(chain=["distinct_id"])

Expand Down Expand Up @@ -233,6 +245,9 @@ class EventsTable(Table):
def clickhouse_table(self):
return "events"

def hogql_table(self):
return "events"


class SessionRecordingEvents(Table):
uuid: StringDatabaseField = StringDatabaseField(name="uuid")
Expand All @@ -259,6 +274,9 @@ class SessionRecordingEvents(Table):
def clickhouse_table(self):
return "session_recording_events"

def hogql_table(self):
return "session_recording_events"


class Database(BaseModel):
class Config:
Expand Down
7 changes: 5 additions & 2 deletions posthog/hogql/printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ def print_ast(
# modify the cloned tree as needed
if dialect == "clickhouse":
expand_asterisks(node)
resolve_lazy_tables(node, stack)
# TODO: add team_id checks (currently done in the printer)
resolve_lazy_tables(node, stack, context)

# _Printer also adds a team_id guard if printing clickhouse
return _Printer(context=context, dialect=dialect, stack=stack or []).visit(node)


Expand Down Expand Up @@ -395,6 +395,9 @@ def visit_field_ref(self, ref: ast.FieldRef):
return field_sql

def visit_property_ref(self, ref: ast.PropertyRef):
if ref.joined_subquery is not None and ref.joined_subquery_field_name is not None:
return f"{self._print_identifier(ref.joined_subquery.name)}.{self._print_identifier(ref.joined_subquery_field_name)}"

field_ref = ref.parent
field = field_ref.resolve_database_field()

Expand Down
4 changes: 3 additions & 1 deletion posthog/hogql/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ def visit_join_expr(self, node):
node.table.ref = self.visit(node.table)
if node.alias is not None:
if node.alias in scope.tables:
raise ResolverException(f'Already have joined a table called "{node.alias}". Can\'t redefine.')
raise ResolverException(
f'Already have joined a table called "{node.alias}". Can\'t join another one with the same name.'
)
node.ref = ast.SelectQueryAliasRef(name=node.alias, ref=node.table.ref)
scope.tables[node.alias] = node.ref
else:
Expand Down
21 changes: 17 additions & 4 deletions posthog/hogql/test/test_printer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,29 @@ def test_fields_and_properties(self):
"replaceRegexpAll(JSONExtractRaw(properties, %(hogql_val_0)s), '^\"|\"$', '')",
)

context = HogQLContext()
context = HogQLContext(within_non_hogql_query=True, using_person_on_events=False)
self.assertEqual(
self._expr("person.properties.bla", context),
"replaceRegexpAll(JSONExtractRaw(events__pdi__person.properties, %(hogql_val_0)s), '^\"|\"$', '')",
"replaceRegexpAll(JSONExtractRaw(person_props, %(hogql_val_0)s), '^\"|\"$', '')",
)

context = HogQLContext(within_non_hogql_query=True, using_person_on_events=False)
context = HogQLContext(within_non_hogql_query=True, using_person_on_events=True)
self.assertEqual(
self._expr("person.properties.bla", context),
"replaceRegexpAll(JSONExtractRaw(person_props, %(hogql_val_0)s), '^\"|\"$', '')",
"replaceRegexpAll(JSONExtractRaw(person_properties, %(hogql_val_0)s), '^\"|\"$', '')",
)

context = HogQLContext(within_non_hogql_query=False, using_person_on_events=False)
self.assertEqual(
self._expr("person.properties.bla", context),
"events__pdi__person.properties___bla",
)

context = HogQLContext(within_non_hogql_query=False, using_person_on_events=True)
self.assertEqual(
# TODO: for now, explicitly writing "poe." to opt in. Automatic switching will come soon.
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

HogQL is not yet able to turn PoE (in its current form) on or off depending on the team's settings. It is doing this inside existing insights (non-hogql queries), but not in full hogql queries. This is planned for another PR.

self._expr("poe.properties.bla", context),
"replaceRegexpAll(JSONExtractRaw(events.person_properties, %(hogql_val_0)s), '^\"|\"$', '')",
)

def test_hogql_properties(self):
Expand Down
Loading