diff --git a/cities.parquet b/cities.parquet deleted file mode 100644 index 13386d4..0000000 Binary files a/cities.parquet and /dev/null differ diff --git a/data/create_edges_follows.py b/data/create_edges_follows.py index 3c4f7a9..fa0e7db 100644 --- a/data/create_edges_follows.py +++ b/data/create_edges_follows.py @@ -5,9 +5,9 @@ while also keeping edges between nodes in a way that's not a uniform distribution. In the real world, some people are way more connected than others. """ + import argparse from pathlib import Path -from typing import Any import numpy as np import polars as pl @@ -68,7 +68,9 @@ def create_super_node_edges(persons_df: pl.DataFrame) -> pl.DataFrame: # Take in the column val of num_connections and return a list of IDs from persons_df super_nodes_df.with_columns( pl.col("num_connections") - .map_elements(lambda x: select_random_ids(persons_df, x)) + .map_elements( + lambda x: select_random_ids(persons_df, x), return_dtype=pl.List(pl.Int64) + ) .alias("connections") ) # Explode the connections column to create a row for each connection diff --git a/data/create_edges_interests.py b/data/create_edges_interests.py index f6590f4..79f8853 100644 --- a/data/create_edges_interests.py +++ b/data/create_edges_interests.py @@ -1,6 +1,7 @@ """ Generate edges between persons and their interests """ + import argparse from pathlib import Path @@ -37,7 +38,10 @@ def main() -> None: # Take in the column val of num_connections and return a list of IDs from persons_df persons_df.with_columns( pl.col("num_interests") - .map_elements(lambda x: select_random_ids(interests_df, "interest_id", x)) + .map_elements( + lambda x: select_random_ids(interests_df, "interest_id", x), + return_dtype=pl.List(pl.Int64), + ) .alias("interests") ) # Explode the connections column to create a row for each connection @@ -51,7 +55,7 @@ def main() -> None: print(f"Limiting edges to {NUM} per the `--num` argument") # Write nodes edges_df = edges_df.rename({"id": "from", "interests": "to"}) - edges_df.write_parquet(Path("output/edges") / "interests.parquet") + edges_df.write_parquet(Path("output/edges") / "interested_in.parquet") print(f"Wrote {len(edges_df)} edges for {len(persons_df)} persons") diff --git a/data/create_edges_location.py b/data/create_edges_location.py index 2487310..8893e01 100644 --- a/data/create_edges_location.py +++ b/data/create_edges_location.py @@ -1,6 +1,7 @@ """ Generate edges between persons and their residence locations """ + import argparse from pathlib import Path @@ -41,7 +42,8 @@ def main() -> None: top_cities_df = ( city_counts_df.join(residence_loc_df, on="city_id", how="left") # List top 5 cities - .sort("len", descending=True).head(5) + .sort("len", descending=True) + .head(5) ) top_5 = top_cities_df["city"].to_list() # Limit the number of edges diff --git a/data/create_edges_location_city_state.py b/data/create_edges_location_city_state.py index 8a5c05b..3488c19 100644 --- a/data/create_edges_location_city_state.py +++ b/data/create_edges_location_city_state.py @@ -1,6 +1,7 @@ """ Generate edges between cities and the states to which they belong """ + from pathlib import Path import polars as pl diff --git a/data/create_edges_location_state_country.py b/data/create_edges_location_state_country.py index c63daf9..28f3fea 100644 --- a/data/create_edges_location_state_country.py +++ b/data/create_edges_location_state_country.py @@ -1,6 +1,7 @@ """ Generate edges between states and the countries to which they belong """ + from pathlib import Path import polars as pl diff --git a/data/create_nodes_interests.py b/data/create_nodes_interests.py index a07d72f..94be939 100644 --- a/data/create_nodes_interests.py +++ b/data/create_nodes_interests.py @@ -2,6 +2,7 @@ Generate nodes for a person's interests These are activities or hobbies person in the real world might have """ + from pathlib import Path import polars as pl diff --git a/data/create_nodes_location.py b/data/create_nodes_location.py index d7486e8..43c0adb 100644 --- a/data/create_nodes_location.py +++ b/data/create_nodes_location.py @@ -1,6 +1,7 @@ """ Generate nodes for cities, states and countries """ + import argparse import unicodedata from pathlib import Path @@ -36,7 +37,7 @@ def get_cities_df(world_cities: pl.DataFrame) -> pl.DataFrame: def write_city_nodes(cities_of_interest: pl.DataFrame) -> pl.DataFrame: # Convert states column to ascii as it has problematic characters cities_of_interest = cities_of_interest.with_columns( - pl.col("admin_name").map_elements(remove_accents) + pl.col("admin_name").map_elements(remove_accents, return_dtype=pl.String) ).drop("city") # Rename columns cities_of_interest = cities_of_interest.rename({"city_ascii": "city", "admin_name": "state"}) diff --git a/data/create_nodes_person.py b/data/create_nodes_person.py index 81c41d9..5a52a0a 100644 --- a/data/create_nodes_person.py +++ b/data/create_nodes_person.py @@ -3,6 +3,7 @@ A 50-50% male/female profile distribution is used and names are generated using the faker library. """ + import argparse from datetime import date from pathlib import Path diff --git a/data/output/edges/city_in.parquet b/data/output/edges/city_in.parquet index fd20e8b..08fdc38 100644 Binary files a/data/output/edges/city_in.parquet and b/data/output/edges/city_in.parquet differ diff --git a/data/output/edges/follows.parquet b/data/output/edges/follows.parquet index 75b280e..cb35614 100644 Binary files a/data/output/edges/follows.parquet and b/data/output/edges/follows.parquet differ diff --git a/data/output/edges/interested_in.parquet b/data/output/edges/interested_in.parquet new file mode 100644 index 0000000..2a794ac Binary files /dev/null and b/data/output/edges/interested_in.parquet differ diff --git a/data/output/edges/interests.parquet b/data/output/edges/interests.parquet deleted file mode 100644 index 7608c8c..0000000 Binary files a/data/output/edges/interests.parquet and /dev/null differ diff --git a/data/output/edges/lives_in.parquet b/data/output/edges/lives_in.parquet index 5009d89..2129348 100644 Binary files a/data/output/edges/lives_in.parquet and b/data/output/edges/lives_in.parquet differ diff --git a/data/output/edges/state_in.parquet b/data/output/edges/state_in.parquet index 1eba51d..f977799 100644 Binary files a/data/output/edges/state_in.parquet and b/data/output/edges/state_in.parquet differ diff --git a/data/output/nodes/cities.parquet b/data/output/nodes/cities.parquet index 8737f66..2a8ecc8 100644 Binary files a/data/output/nodes/cities.parquet and b/data/output/nodes/cities.parquet differ diff --git a/data/output/nodes/countries.parquet b/data/output/nodes/countries.parquet index f8ec99a..231a95a 100644 Binary files a/data/output/nodes/countries.parquet and b/data/output/nodes/countries.parquet differ diff --git a/data/output/nodes/interests.parquet b/data/output/nodes/interests.parquet index 480d23a..f08604b 100644 Binary files a/data/output/nodes/interests.parquet and b/data/output/nodes/interests.parquet differ diff --git a/data/output/nodes/persons.parquet b/data/output/nodes/persons.parquet index 7baa8a3..64397b3 100644 Binary files a/data/output/nodes/persons.parquet and b/data/output/nodes/persons.parquet differ diff --git a/data/output/nodes/states.parquet b/data/output/nodes/states.parquet index e0b8e75..526a3c4 100644 Binary files a/data/output/nodes/states.parquet and b/data/output/nodes/states.parquet differ diff --git a/kuzudb/build_graph.py b/kuzudb/build_graph.py index 8ece48d..72ad76a 100644 --- a/kuzudb/build_graph.py +++ b/kuzudb/build_graph.py @@ -114,7 +114,7 @@ def main(conn: Connection) -> None: create_edge_tables(conn) conn.execute(f"COPY Follows FROM '{EDGES_PATH}/follows.parquet';") conn.execute(f"COPY LivesIn FROM '{EDGES_PATH}/lives_in.parquet';") - conn.execute(f"COPY HasInterest FROM '{EDGES_PATH}/interests.parquet';") + conn.execute(f"COPY HasInterest FROM '{EDGES_PATH}/interested_in.parquet';") conn.execute(f"COPY CityIn FROM '{EDGES_PATH}/city_in.parquet';") conn.execute(f"COPY StateIn FROM '{EDGES_PATH}/state_in.parquet';") diff --git a/neo4j/build_graph.py b/neo4j/build_graph.py index f095a82..0533368 100644 --- a/neo4j/build_graph.py +++ b/neo4j/build_graph.py @@ -92,7 +92,7 @@ async def merge_edges_person(tx: AsyncManagedTransaction, data: list[JsonBlob]) await tx.run(query, data=data) -async def merge_edges_interests(tx: AsyncManagedTransaction, data: list[JsonBlob]) -> None: +async def merge_edges_interested_in(tx: AsyncManagedTransaction, data: list[JsonBlob]) -> None: query = """ UNWIND $data AS row MATCH (p:Person {personID: row.from}) @@ -193,8 +193,8 @@ async def write_nodes(session: AsyncSession) -> None: async def write_edges(session: AsyncSession) -> None: await ingest_person_edges_in_batches(session, merge_edges_person) # Write person-interest edges - interests = pl.read_parquet(f"{EDGES_PATH}/interests.parquet") - await session.execute_write(merge_edges_interests, data=interests.to_dicts()) + interests = pl.read_parquet(f"{EDGES_PATH}/interested_in.parquet") + await session.execute_write(merge_edges_interested_in, data=interests.to_dicts()) # Write person-city edges cities = pl.read_parquet(f"{EDGES_PATH}/lives_in.parquet") await session.execute_write(merge_edges_lives_in, data=cities.to_dicts())