Skip to content

Commit

Permalink
Merge pull request #41 from prrao87/fix-filenaming
Browse files Browse the repository at this point in the history
Update data generation and file naming
  • Loading branch information
prrao87 authored Aug 19, 2024
2 parents 25e6011 + db5d2f1 commit b688f6e
Show file tree
Hide file tree
Showing 22 changed files with 23 additions and 10 deletions.
Binary file removed cities.parquet
Binary file not shown.
6 changes: 4 additions & 2 deletions data/create_edges_follows.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
while also keeping edges between nodes in a way that's not a uniform distribution.
In the real world, some people are way more connected than others.
"""

import argparse
from pathlib import Path
from typing import Any

import numpy as np
import polars as pl
Expand Down Expand Up @@ -68,7 +68,9 @@ def create_super_node_edges(persons_df: pl.DataFrame) -> pl.DataFrame:
# Take in the column val of num_connections and return a list of IDs from persons_df
super_nodes_df.with_columns(
pl.col("num_connections")
.map_elements(lambda x: select_random_ids(persons_df, x))
.map_elements(
lambda x: select_random_ids(persons_df, x), return_dtype=pl.List(pl.Int64)
)
.alias("connections")
)
# Explode the connections column to create a row for each connection
Expand Down
8 changes: 6 additions & 2 deletions data/create_edges_interests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Generate edges between persons and their interests
"""

import argparse
from pathlib import Path

Expand Down Expand Up @@ -37,7 +38,10 @@ def main() -> None:
# Take in the column val of num_interests and return a list of IDs from interests_df
persons_df.with_columns(
pl.col("num_interests")
.map_elements(lambda x: select_random_ids(interests_df, "interest_id", x))
.map_elements(
lambda x: select_random_ids(interests_df, "interest_id", x),
return_dtype=pl.List(pl.Int64),
)
.alias("interests")
)
# Explode the connections column to create a row for each connection
Expand All @@ -51,7 +55,7 @@ def main() -> None:
print(f"Limiting edges to {NUM} per the `--num` argument")
# Write edges
edges_df = edges_df.rename({"id": "from", "interests": "to"})
edges_df.write_parquet(Path("output/edges") / "interests.parquet")
edges_df.write_parquet(Path("output/edges") / "interested_in.parquet")
print(f"Wrote {len(edges_df)} edges for {len(persons_df)} persons")


Expand Down
4 changes: 3 additions & 1 deletion data/create_edges_location.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Generate edges between persons and their residence locations
"""

import argparse
from pathlib import Path

Expand Down Expand Up @@ -41,7 +42,8 @@ def main() -> None:
top_cities_df = (
city_counts_df.join(residence_loc_df, on="city_id", how="left")
# List top 5 cities
.sort("len", descending=True).head(5)
.sort("len", descending=True)
.head(5)
)
top_5 = top_cities_df["city"].to_list()
# Limit the number of edges
Expand Down
1 change: 1 addition & 0 deletions data/create_edges_location_city_state.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Generate edges between cities and the states to which they belong
"""

from pathlib import Path

import polars as pl
Expand Down
1 change: 1 addition & 0 deletions data/create_edges_location_state_country.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Generate edges between states and the countries to which they belong
"""

from pathlib import Path

import polars as pl
Expand Down
1 change: 1 addition & 0 deletions data/create_nodes_interests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Generate nodes for a person's interests
These are activities or hobbies a person in the real world might have
"""

from pathlib import Path

import polars as pl
Expand Down
3 changes: 2 additions & 1 deletion data/create_nodes_location.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Generate nodes for cities, states and countries
"""

import argparse
import unicodedata
from pathlib import Path
Expand Down Expand Up @@ -36,7 +37,7 @@ def get_cities_df(world_cities: pl.DataFrame) -> pl.DataFrame:
def write_city_nodes(cities_of_interest: pl.DataFrame) -> pl.DataFrame:
# Convert states column to ascii as it has problematic characters
cities_of_interest = cities_of_interest.with_columns(
pl.col("admin_name").map_elements(remove_accents)
pl.col("admin_name").map_elements(remove_accents, return_dtype=pl.String)
).drop("city")
# Rename columns
cities_of_interest = cities_of_interest.rename({"city_ascii": "city", "admin_name": "state"})
Expand Down
1 change: 1 addition & 0 deletions data/create_nodes_person.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
A 50-50% male/female profile distribution is used and names are
generated using the faker library.
"""

import argparse
from datetime import date
from pathlib import Path
Expand Down
Binary file modified data/output/edges/city_in.parquet
Binary file not shown.
Binary file modified data/output/edges/follows.parquet
Binary file not shown.
Binary file added data/output/edges/interested_in.parquet
Binary file not shown.
Binary file removed data/output/edges/interests.parquet
Binary file not shown.
Binary file modified data/output/edges/lives_in.parquet
Binary file not shown.
Binary file modified data/output/edges/state_in.parquet
Binary file not shown.
Binary file modified data/output/nodes/cities.parquet
Binary file not shown.
Binary file modified data/output/nodes/countries.parquet
Binary file not shown.
Binary file modified data/output/nodes/interests.parquet
Binary file not shown.
Binary file modified data/output/nodes/persons.parquet
Binary file not shown.
Binary file modified data/output/nodes/states.parquet
Binary file not shown.
2 changes: 1 addition & 1 deletion kuzudb/build_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def main(conn: Connection) -> None:
create_edge_tables(conn)
conn.execute(f"COPY Follows FROM '{EDGES_PATH}/follows.parquet';")
conn.execute(f"COPY LivesIn FROM '{EDGES_PATH}/lives_in.parquet';")
conn.execute(f"COPY HasInterest FROM '{EDGES_PATH}/interests.parquet';")
conn.execute(f"COPY HasInterest FROM '{EDGES_PATH}/interested_in.parquet';")
conn.execute(f"COPY CityIn FROM '{EDGES_PATH}/city_in.parquet';")
conn.execute(f"COPY StateIn FROM '{EDGES_PATH}/state_in.parquet';")

Expand Down
6 changes: 3 additions & 3 deletions neo4j/build_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ async def merge_edges_person(tx: AsyncManagedTransaction, data: list[JsonBlob])
await tx.run(query, data=data)


async def merge_edges_interests(tx: AsyncManagedTransaction, data: list[JsonBlob]) -> None:
async def merge_edges_interested_in(tx: AsyncManagedTransaction, data: list[JsonBlob]) -> None:
query = """
UNWIND $data AS row
MATCH (p:Person {personID: row.from})
Expand Down Expand Up @@ -193,8 +193,8 @@ async def write_nodes(session: AsyncSession) -> None:
async def write_edges(session: AsyncSession) -> None:
await ingest_person_edges_in_batches(session, merge_edges_person)
# Write person-interest edges
interests = pl.read_parquet(f"{EDGES_PATH}/interests.parquet")
await session.execute_write(merge_edges_interests, data=interests.to_dicts())
interests = pl.read_parquet(f"{EDGES_PATH}/interested_in.parquet")
await session.execute_write(merge_edges_interested_in, data=interests.to_dicts())
# Write person-city edges
cities = pl.read_parquet(f"{EDGES_PATH}/lives_in.parquet")
await session.execute_write(merge_edges_lives_in, data=cities.to_dicts())
Expand Down

0 comments on commit b688f6e

Please sign in to comment.