diff --git a/README.md b/README.md index 24bfe88..ca6fbf6 100644 --- a/README.md +++ b/README.md @@ -76,8 +76,9 @@ The following questions are asked of both graphs: * **Query 6**: Which city has the maximum number of women that like Tennis? * **Query 7**: Which U.S. state has the maximum number of persons between the age 23-30 who enjoy photography? * **Query 8**: How many second-degree connections of persons are reachable in the graph? -* **Query 9**: Which 'influencers' (people with > 3K followers) below age 30 in the network follow the most people? -* **Query 10**: How many persons in the network are followed by people that follow influencers in the age range 18-25? +* **Query 9**: Which "influencers" (people with > 3K followers) younger than 30 follow the most people? +* **Query 10**: How many people are followed by "influencers" (people with > 3K followers) aged 18-25? + ## Performance comparison @@ -116,16 +117,16 @@ The following table shows the average run times for each query, and the speedup Query | Neo4j (sec) | Kùzu (sec) | Speedup factor --- | ---: | ---: | ---: -1 | 1.8677 | 0.2275650 | 8.2 -2 | 0.7052 | 0.2433142 | 2.9 -3 | 0.0056 | 0.0097056 | 0.6 -4 | 0.0541 | 0.0092325 | 5.9 -5 | 0.0074 | 0.0047592 | 1.6 -6 | 0.0210 | 0.0298077 | 0.7 -7 | 0.1618 | 0.0077759 | 20.8 -8 | 0.9019 | 0.1039609 | 8.7 -9 | 7.1976 | 0.8596641 | 8.4 -10 | 9.0518 | 0.7894154 | 11.5 +1 | 1.8578 | 0.2012965 | 9.2 +2 | 0.6384 | 0.2493954 | 2.6 +3 | 0.0405 | 0.0109885 | 3.7 +4 | 0.0471 | 0.0103636 | 4.5 +5 | 0.0084 | 0.0048151 | 1.7 +6 | 0.0218 | 0.0298180 | 0.7 +7 | 0.1634 | 0.0078995 | 20.7 +8 | 0.8726 | 0.1082653 | 8.1 +9 | 7.9377 | 0.8890417 | 8.9 +10 | 8.7908 | 0.7810308 | 11.2 #### Neo4j vs. Kùzu multi-threaded @@ -133,15 +134,15 @@ KùzuDB (by default) supports multi-threaded execution of queries. 
The following Query | Neo4j (sec) | KΓΉzu (sec) | Speedup factor --- | ---: | ---: | ---: -1 | 1.8677 | 0.1361030 | 13.7 -2 | 0.7052 | 0.1259788 | 5.6 -3 | 0.0056 | 0.0072587 | 0.8 -4 | 0.0541 | 0.0080971 | 6.7 -5 | 0.0074 | 0.0050197 | 1.5 -6 | 0.0210 | 0.0124106 | 1.7 -7 | 0.1618 | 0.0066288 | 24.4 -8 | 0.9019 | 0.0236917 | 38.1 -9 | 7.1976 | 0.5698440 | 12.6 -10 | 9.0518 | 0.5460965 | 16.6 - -> πŸ”₯ The second-degree path finding query (8) shows the biggest speedup over Neo4j for the 100K node, 2.4M edge graph, and the average speedup over Neo4j across all queries when using KΓΉzu in multi-threaded mode is **~15x**. +1 | 1.8578 | 0.1450578 | 12.8 +2 | 0.6384 | 0.1281020 | 5.0 +3 | 0.0405 | 0.0081829 | 5.0 +4 | 0.0471 | 0.0079130 | 6.0 +5 | 0.0084 | 0.0048294 | 1.7 +6 | 0.0218 | 0.0125634 | 1.7 +7 | 0.1634 | 0.0065953 | 24.8 +8 | 0.8726 | 0.0250031 | 34.9 +9 | 7.9377 | 0.5911415 | 13.4 +10 | 8.7908 | 0.5632572 | 15.6 + +> πŸ”₯ The second-degree path finding query (8) shows the biggest speedup over Neo4j for the 100K node, 2.4M edge graph, and the average speedup over Neo4j across all queries when using KΓΉzu in multi-threaded mode is **~12x**. 
diff --git a/kuzudb/README.md b/kuzudb/README.md index 2696543..dfa5078 100644 --- a/kuzudb/README.md +++ b/kuzudb/README.md @@ -289,23 +289,24 @@ shape: (1, 5) Query 3: - MATCH (p:Person) -[:LivesIn]-> (c:City)-[*1..2]-> (co:Country {country: $country}) + MATCH (p:Person) -[:LivesIn]-> (c:City) -[*1..2]-> (co:Country) + WHERE co.country = $country RETURN c.city AS city, avg(p.age) AS averageAge ORDER BY averageAge LIMIT 5; -Cities with lowest average age in Canada: +Cities with lowest average age in United States: shape: (5, 2) -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ city ┆ averageAge β”‚ -β”‚ --- ┆ --- β”‚ -β”‚ str ┆ f64 β”‚ -β•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════║ -β”‚ Montreal ┆ 37.328018 β”‚ -β”‚ Calgary ┆ 37.607205 β”‚ -β”‚ Toronto ┆ 37.720255 β”‚ -β”‚ Edmonton ┆ 37.943678 β”‚ -β”‚ Vancouver ┆ 38.023227 β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ city ┆ averageAge β”‚ +β”‚ --- ┆ --- β”‚ +β”‚ str ┆ f64 β”‚ +β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════║ +β”‚ Louisville ┆ 37.099473 β”‚ +β”‚ Denver ┆ 37.202703 β”‚ +β”‚ San Francisco ┆ 37.26213 β”‚ +β”‚ Tampa ┆ 37.327765 β”‚ +β”‚ Nashville ┆ 37.343006 β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Query 4: @@ -407,7 +408,51 @@ shape: (1, 1) β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•‘ β”‚ 1214477 β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -Queries completed in 1.2756s + +Query 9: + + MATCH (:Person)-[r1:Follows]->(influencer:Person)-[r2:Follows]->(:Person) + WITH count(r1) AS numFollowers, influencer, id(r2) as r2ID + WHERE influencer.age <= $age_upper AND numFollowers > 3000 + RETURN influencer.id AS influencerId, influencer.name AS name, count(r2ID) AS numFollows + ORDER BY numFollows DESC LIMIT 5; + + + Influencers below age 30 who follow 
the most people: +shape: (5, 3) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ influencerId ┆ name ┆ numFollows β”‚ +β”‚ --- ┆ --- ┆ --- β”‚ +β”‚ i64 ┆ str ┆ i64 β”‚ +β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════════β•ͺ════════════║ +β”‚ 89758 ┆ Joshua Williams ┆ 40 β”‚ +β”‚ 1348 ┆ Brett Wright ┆ 32 β”‚ +β”‚ 8077 ┆ Ralph Floyd ┆ 32 β”‚ +β”‚ 85914 ┆ Micheal Holt ┆ 32 β”‚ +β”‚ 2386 ┆ Robert Graham ┆ 31 β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + + +Query 10: + + MATCH (:Person)-[r1:Follows]->(influencer:Person)-[r2:Follows]->(person:Person) + WITH count(id(r1)) AS numFollowers1, person, influencer, id(r2) as r2ID + WHERE influencer.age >= $age_lower AND influencer.age <= $age_upper AND numFollowers1 > 3000 + RETURN count(r2ID) AS numFollowers2 + ORDER BY numFollowers2 DESC LIMIT 5; + + + Number of people followed by influencers in the age range 18-25: +shape: (1, 1) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ numFollowers2 β”‚ +β”‚ --- β”‚ +β”‚ i64 β”‚ +β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•‘ +β”‚ 690 β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + +Queries completed in 2.7552s ``` #### Query performance benchmark (KΓΉzu single-threaded) @@ -416,69 +461,69 @@ The benchmark is run using `pytest-benchmark` package as follows. 
```sh $ pytest benchmark_query.py --benchmark-min-rounds=5 --benchmark-warmup-iterations=5 --benchmark-disable-gc --benchmark-sort=fullname -====================================== test session starts ======================================= +========================================= test session starts ========================================== platform darwin -- Python 3.11.2, pytest-7.4.0, pluggy-1.2.0 benchmark: 4.0.0 (defaults: timer=time.perf_counter disable_gc=True min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=5) rootdir: /code/kuzudb-study/kuzudb plugins: Faker-19.2.0, anyio-3.7.1, benchmark-4.0.0 -collected 10 items +collected 10 items -benchmark_query.py .......... [100%] +benchmark_query.py .......... [100%] -------------------------------------------------------------------------------------- benchmark: 10 tests -------------------------------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -test_benchmark_query1 204.5318 (47.91) 264.5275 (37.83) 227.5650 (47.82) 24.9057 (78.87) 220.8040 (47.48) 39.0229 (167.82) 1;0 4.3943 (0.02) 5 1 -test_benchmark_query10 781.3306 (183.02) 801.5248 (114.61) 789.4154 (165.87) 8.1112 (25.69) 789.5400 (169.76) 11.9668 (51.47) 1;0 1.2668 (0.01) 5 1 -test_benchmark_query2 237.0291 (55.52) 253.8298 (36.30) 243.3142 (51.13) 6.3798 (20.20) 241.2695 (51.88) 6.8318 (29.38) 1;0 4.1099 (0.02) 5 1 -test_benchmark_query3 8.5850 (2.01) 10.6163 (1.52) 9.7056 (2.04) 0.3943 (1.25) 9.7931 (2.11) 0.4372 (1.88) 25;4 103.0336 (0.49) 76 1 -test_benchmark_query4 8.6458 (2.03) 10.8680 (1.55) 9.2325 (1.94) 0.5057 (1.60) 9.0836 (1.95) 0.6175 (2.66) 22;1 108.3130 (0.52) 74 1 -test_benchmark_query5 4.2691 (1.0) 6.9932 
(1.0) 4.7592 (1.0) 0.4073 (1.29) 4.6508 (1.0) 0.6067 (2.61) 11;1 210.1198 (1.0) 81 1 -test_benchmark_query6 27.7651 (6.50) 31.7360 (4.54) 29.8077 (6.26) 0.9314 (2.95) 29.8461 (6.42) 1.0849 (4.67) 8;0 33.5484 (0.16) 33 1 -test_benchmark_query7 6.9663 (1.63) 8.5708 (1.23) 7.7759 (1.63) 0.3158 (1.0) 7.7834 (1.67) 0.2325 (1.0) 18;14 128.6021 (0.61) 85 1 -test_benchmark_query8 100.7867 (23.61) 110.9126 (15.86) 103.9609 (21.84) 2.9900 (9.47) 103.4902 (22.25) 1.8488 (7.95) 3;2 9.6190 (0.05) 10 1 -test_benchmark_query9 853.4369 (199.91) 867.1944 (124.00) 859.6641 (180.63) 6.7229 (21.29) 856.5395 (184.17) 12.5563 (54.00) 2;0 1.1632 (0.01) 5 1 +test_benchmark_query1 187.6215 (42.79) 215.1583 (40.93) 201.2965 (41.81) 10.8608 (34.30) 200.3082 (41.39) 17.0231 (62.64) 2;0 4.9678 (0.02) 5 1 +test_benchmark_query10 769.6619 (175.53) 801.6863 (152.51) 781.0308 (162.21) 13.8731 (43.81) 773.9472 (159.92) 21.4789 (79.04) 1;0 1.2804 (0.01) 5 1 +test_benchmark_query2 224.0829 (51.10) 263.8683 (50.20) 249.3954 (51.79) 16.1696 (51.07) 256.7539 (53.05) 22.2899 (82.02) 1;0 4.0097 (0.02) 5 1 +test_benchmark_query3 10.1482 (2.31) 12.0562 (2.29) 10.9885 (2.28) 0.3852 (1.22) 11.0705 (2.29) 0.2835 (1.04) 11;9 91.0040 (0.44) 44 1 +test_benchmark_query4 8.7894 (2.00) 19.4709 (3.70) 10.3636 (2.15) 1.8377 (5.80) 9.7671 (2.02) 1.1838 (4.36) 5;5 96.4919 (0.46) 69 1 +test_benchmark_query5 4.3848 (1.0) 5.2565 (1.0) 4.8151 (1.0) 0.3166 (1.0) 4.8396 (1.0) 0.6334 (2.33) 15;0 207.6812 (1.0) 32 1 +test_benchmark_query6 28.5645 (6.51) 31.3298 (5.96) 29.8180 (6.19) 0.6957 (2.20) 29.9759 (6.19) 1.1007 (4.05) 11;0 33.5368 (0.16) 32 1 +test_benchmark_query7 7.0635 (1.61) 8.9225 (1.70) 7.8995 (1.64) 0.3691 (1.17) 7.8556 (1.62) 0.2718 (1.0) 18;15 126.5904 (0.61) 71 1 +test_benchmark_query8 99.0060 (22.58) 123.4725 (23.49) 108.2653 (22.48) 7.8141 (24.68) 107.2657 (22.16) 12.5305 (46.11) 3;0 9.2366 (0.04) 10 1 +test_benchmark_query9 854.6426 (194.91) 932.6182 (177.42) 889.0417 (184.64) 33.3944 (105.46) 874.2172 
(180.64) 55.3503 (203.68) 2;0 1.1248 (0.01) 5 1 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Legend: Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. OPS: Operations Per Second, computed as 1 / Mean -====================================== 10 passed in 20.95s ======================================= +========================================= 10 passed in 20.84s ========================================== ``` #### Query performance (KΓΉzu multi-threaded) ```sh $ pytest benchmark_query.py --benchmark-min-rounds=5 --benchmark-warmup-iterations=5 --benchmark-disable-gc --benchmark-sort=fullname -====================================== test session starts ======================================= +========================================= test session starts ========================================== platform darwin -- Python 3.11.2, pytest-7.4.0, pluggy-1.2.0 benchmark: 4.0.0 (defaults: timer=time.perf_counter disable_gc=True min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=5) rootdir: /code/kuzudb-study/kuzudb plugins: Faker-19.2.0, anyio-3.7.1, benchmark-4.0.0 -collected 10 items +collected 10 items -benchmark_query.py .......... [100%] +benchmark_query.py .......... 
[100%] -------------------------------------------------------------------------------------- benchmark: 10 tests -------------------------------------------------------------------------------------- Name (time in ms) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -test_benchmark_query1 112.0785 (27.29) 217.2179 (36.61) 136.1030 (27.11) 45.5192 (136.94) 115.3392 (23.11) 33.0138 (106.23) 1;1 7.3474 (0.04) 5 1 -test_benchmark_query10 531.5120 (129.40) 557.1382 (93.90) 546.0965 (108.79) 10.4145 (31.33) 547.9619 (109.79) 16.7082 (53.76) 2;0 1.8312 (0.01) 5 1 -test_benchmark_query2 120.3370 (29.30) 132.8606 (22.39) 125.9788 (25.10) 4.2911 (12.91) 125.2396 (25.09) 5.7256 (18.42) 3;0 7.9378 (0.04) 7 1 -test_benchmark_query3 6.4401 (1.57) 8.0799 (1.36) 7.2587 (1.45) 0.3847 (1.16) 7.2864 (1.46) 0.5087 (1.64) 30;0 137.7665 (0.69) 78 1 -test_benchmark_query4 7.0398 (1.71) 9.8535 (1.66) 8.0971 (1.61) 0.4228 (1.27) 8.0239 (1.61) 0.5342 (1.72) 23;1 123.5016 (0.62) 87 1 -test_benchmark_query5 4.1076 (1.0) 5.9335 (1.0) 5.0197 (1.0) 0.3324 (1.0) 4.9908 (1.0) 0.3108 (1.0) 17;9 199.2147 (1.0) 79 1 -test_benchmark_query6 11.4065 (2.78) 13.9336 (2.35) 12.4106 (2.47) 0.5122 (1.54) 12.3276 (2.47) 0.5818 (1.87) 20;2 80.5766 (0.40) 72 1 -test_benchmark_query7 5.9218 (1.44) 9.0174 (1.52) 6.6288 (1.32) 0.4273 (1.29) 6.5931 (1.32) 0.4345 (1.40) 30;1 150.8580 (0.76) 104 1 -test_benchmark_query8 22.5029 (5.48) 27.1075 (4.57) 23.6917 (4.72) 0.9087 (2.73) 23.4097 (4.69) 0.9917 (3.19) 10;1 42.2088 (0.21) 41 1 -test_benchmark_query9 565.3163 (137.63) 578.1635 (97.44) 569.8440 (113.52) 5.5017 (16.55) 567.1719 (113.64) 8.3636 (26.91) 1;0 1.7549 (0.01) 5 1 +test_benchmark_query1 113.0014 (28.40) 245.0873 (44.68) 145.0578 (30.04) 56.2668 (185.80) 123.1903 (25.58) 43.0426 (136.11) 1;1 6.8938 
(0.03) 5 1 +test_benchmark_query10 540.2672 (135.81) 627.5628 (114.40) 563.2572 (116.63) 36.4845 (120.48) 550.6263 (114.33) 31.8747 (100.80) 1;1 1.7754 (0.01) 5 1 +test_benchmark_query2 123.9972 (31.17) 132.6335 (24.18) 128.1020 (26.53) 2.8402 (9.38) 127.9745 (26.57) 3.6837 (11.65) 2;0 7.8063 (0.04) 7 1 +test_benchmark_query3 7.3900 (1.86) 9.4403 (1.72) 8.1829 (1.69) 0.4120 (1.36) 8.0602 (1.67) 0.4682 (1.48) 19;1 122.2061 (0.59) 63 1 +test_benchmark_query4 7.1140 (1.79) 9.1029 (1.66) 7.9130 (1.64) 0.3950 (1.30) 7.7692 (1.61) 0.6243 (1.97) 24;0 126.3748 (0.61) 82 1 +test_benchmark_query5 3.9783 (1.0) 5.4857 (1.0) 4.8294 (1.0) 0.3028 (1.0) 4.8161 (1.0) 0.3434 (1.09) 16;2 207.0671 (1.0) 64 1 +test_benchmark_query6 11.2117 (2.82) 13.8597 (2.53) 12.5634 (2.60) 0.5755 (1.90) 12.4724 (2.59) 0.8934 (2.83) 19;0 79.5963 (0.38) 66 1 +test_benchmark_query7 5.8453 (1.47) 7.3942 (1.35) 6.5953 (1.37) 0.3223 (1.06) 6.5524 (1.36) 0.3162 (1.0) 27;6 151.6239 (0.73) 84 1 +test_benchmark_query8 22.7547 (5.72) 30.6260 (5.58) 25.0031 (5.18) 1.7501 (5.78) 24.6404 (5.12) 2.7679 (8.75) 11;1 39.9951 (0.19) 38 1 +test_benchmark_query9 586.4883 (147.42) 605.2226 (110.33) 591.1415 (122.41) 7.9578 (26.28) 587.3650 (121.96) 6.5644 (20.76) 1;1 1.6916 (0.01) 5 1 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Legend: Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. 
OPS: Operations Per Second, computed as 1 / Mean -====================================== 10 passed in 15.52s ======================================= +========================================= 10 passed in 15.53s ========================================== ``` diff --git a/kuzudb/benchmark_query.py b/kuzudb/benchmark_query.py index d825a1d..1772f01 100644 --- a/kuzudb/benchmark_query.py +++ b/kuzudb/benchmark_query.py @@ -19,7 +19,7 @@ def connection(): db = kuzu.Database(f"./social_network") conn = kuzu.Connection(db) # For a fairer comparison with Neo4j, where β€œTransactions are single-threaded, confined, and independent.” - conn.set_max_threads_for_exec(1) + # conn.set_max_threads_for_exec(1) yield conn @@ -49,15 +49,15 @@ def test_benchmark_query2(benchmark, connection): def test_benchmark_query3(benchmark, connection): - result = benchmark(query.run_query3, connection, [("country", "Canada")]) + result = benchmark(query.run_query3, connection, [("country", "United States")]) result = result.to_dicts() assert len(result) == 5 - assert result[0]["city"] == "Montreal" - assert result[1]["city"] == "Calgary" - assert result[2]["city"] == "Toronto" - assert result[3]["city"] == "Edmonton" - assert result[4]["city"] == "Vancouver" + assert result[0]["city"] == "Louisville" + assert result[1]["city"] == "Denver" + assert result[2]["city"] == "San Francisco" + assert result[3]["city"] == "Tampa" + assert result[4]["city"] == "Nashville" def test_benchmark_query4(benchmark, connection): diff --git a/kuzudb/query.py b/kuzudb/query.py index c6e93d7..21a7b61 100644 --- a/kuzudb/query.py +++ b/kuzudb/query.py @@ -42,7 +42,7 @@ def run_query2(conn: Connection) -> None: def run_query3(conn: Connection, params: list[tuple[str, Any]]) -> None: "Which 5 cities in a particular country have the lowest average age in the network?" 
query = """ - MATCH (p:Person) -[:LivesIn]-> (c:City)-[:CityIn]-> (:State) -[:StateIn]-> (co:Country) + MATCH (p:Person) -[:LivesIn]-> (c:City) -[*1..2]-> (co:Country) WHERE co.country = $country RETURN c.city AS city, avg(p.age) AS averageAge ORDER BY averageAge LIMIT 5; @@ -146,7 +146,7 @@ def run_query8(conn: Connection) -> None: def run_query9(conn: Connection, params: list[tuple[str, Any]]) -> None: - "Which 'influencers' (people with > 3K followers) below a certain age in the network follow the most people?" + "Which 'influencers' (people with > 3K followers) below a certain age follow the most people?" query = """ MATCH (:Person)-[r1:Follows]->(influencer:Person)-[r2:Follows]->(:Person) WITH count(r1) AS numFollowers, influencer, id(r2) as r2ID @@ -167,7 +167,7 @@ def run_query9(conn: Connection, params: list[tuple[str, Any]]) -> None: def run_query10(conn: Connection, params: list[tuple[str, Any]]) -> None: - "How many people in the network are followed by 'influencers' (people with > 3K followers) within a certain age range in the network?" + "How many people are followed by 'influencers' (people with > 3K followers) within a certain age range?" 
# TODO: Change the query to avoid having to use id(r1) when the projection pushdown analyzer in KΓΉzu is implemented, see PR #23 for details query = """ MATCH (:Person)-[r1:Follows]->(influencer:Person)-[r2:Follows]->(person:Person) @@ -181,7 +181,7 @@ def run_query10(conn: Connection, params: list[tuple[str, Any]]) -> None: result = pl.from_arrow(response.get_as_arrow(chunk_size=1000)) print( f""" - Number of people followed by people who follow influencers between the age of {params[0][1]}-{params[1][1]}:\n{result} + Number of people followed by influencers in the age range {params[0][1]}-{params[1][1]}:\n{result} """ ) return result @@ -191,7 +191,7 @@ def main(conn: Connection) -> None: with Timer(name="queries", text="Queries completed in {:.4f}s"): _ = run_query1(conn) _ = run_query2(conn) - _ = run_query3(conn, params=[("country", "Canada")]) + _ = run_query3(conn, params=[("country", "United States")]) _ = run_query4(conn, params=[("age_lower", 30), ("age_upper", 40)]) _ = run_query5( conn, diff --git a/neo4j/README.md b/neo4j/README.md index c17f199..7e6b1e8 100644 --- a/neo4j/README.md +++ b/neo4j/README.md @@ -74,11 +74,11 @@ The following questions are asked of the graph: ``` Query 1: - + MATCH (follower:Person)-[:FOLLOWS]->(person:Person) RETURN person.personID AS personID, person.name AS name, count(follower) AS numFollowers ORDER BY numFollowers DESC LIMIT 3 - + Top 3 most-followed persons: shape: (3, 3) β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” @@ -92,13 +92,13 @@ shape: (3, 3) β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Query 2: - + MATCH (follower:Person) -[:FOLLOWS]-> (person:Person) WITH person, count(follower) as followers ORDER BY followers DESC LIMIT 1 MATCH (person) -[:LIVES_IN]-> (city:City) RETURN person.name AS name, followers AS numFollowers, city.city AS city, 
city.state AS state, city.country AS country - + City in which most-followed person lives: shape: (1, 5) β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” @@ -110,32 +110,33 @@ shape: (1, 5) β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Query 3: - - MATCH (p:Person) -[:LIVES_IN]-> (c:City) -[*1..2]-> (co:Country {country: $country}) + + MATCH (p:Person) -[:LIVES_IN]-> (c:City) -[*1..2]-> (co:Country) + WHERE co.country = $country RETURN c.city AS city, avg(p.age) AS averageAge ORDER BY averageAge LIMIT 5 - -Cities with lowest average age in Canada: + +Cities with lowest average age in United States: shape: (5, 2) -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ city ┆ averageAge β”‚ -β”‚ --- ┆ --- β”‚ -β”‚ str ┆ f64 β”‚ -β•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════║ -β”‚ Montreal ┆ 37.328018 β”‚ -β”‚ Calgary ┆ 37.607205 β”‚ -β”‚ Toronto ┆ 37.720255 β”‚ -β”‚ Edmonton ┆ 37.943678 β”‚ -β”‚ Vancouver ┆ 38.023227 β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ city ┆ averageAge β”‚ +β”‚ --- ┆ --- β”‚ +β”‚ str ┆ f64 β”‚ +β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════║ +β”‚ Louisville ┆ 37.099473 β”‚ +β”‚ Denver ┆ 37.202703 β”‚ +β”‚ San Francisco ┆ 37.26213 β”‚ +β”‚ Tampa ┆ 37.327765 β”‚ +β”‚ Nashville ┆ 37.343006 β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Query 4: - + MATCH (p:Person)-[:LIVES_IN]->(ci:City)-[*1..2]->(country:Country) WHERE p.age >= $age_lower AND p.age <= $age_upper RETURN country.country AS countries, count(country) AS 
personCounts ORDER BY personCounts DESC LIMIT 3 - + Persons between ages 30-40 in each country: shape: (3, 2) β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” @@ -149,7 +150,7 @@ shape: (3, 2) β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Query 5: - + MATCH (p:Person)-[:HAS_INTEREST]->(i:Interest) WHERE tolower(i.interest) = tolower($interest) AND tolower(p.gender) = tolower($gender) @@ -157,7 +158,7 @@ Query 5: MATCH (p)-[:LIVES_IN]->(c:City) WHERE c.city = $city AND c.country = $country RETURN count(p) AS numPersons - + Number of male users in London, United Kingdom who have an interest in fine dining: shape: (1, 1) β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” @@ -169,7 +170,7 @@ shape: (1, 1) β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Query 6: - + MATCH (p:Person)-[:HAS_INTEREST]->(i:Interest) WHERE tolower(i.interest) = tolower($interest) AND tolower(p.gender) = tolower($gender) @@ -177,7 +178,7 @@ Query 6: MATCH (p)-[:LIVES_IN]->(c:City) RETURN count(p) AS numPersons, c.city AS city, c.country AS country ORDER BY numPersons DESC LIMIT 5 - + Cities with the most female users who have an interest in tennis: shape: (5, 3) β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” @@ -193,7 +194,7 @@ shape: (5, 3) β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Query 7: - + MATCH (p:Person)-[:LIVES_IN]->(:City)-[:CITY_IN]->(s:State) WHERE p.age >= $age_lower AND p.age <= $age_upper AND s.country = $country WITH p, s @@ -201,7 +202,7 @@ Query 7: WHERE tolower(i.interest) = tolower($interest) RETURN count(p) AS numPersons, s.state AS state, s.country AS country ORDER BY numPersons DESC LIMIT 1 - + State in United States with the most users between ages 23-30 who have an interest in photography: shape: (1, 3) 
@@ -212,14 +213,14 @@ shape: (1, 3) β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•ͺ════════════β•ͺ═══════════════║ β”‚ 170 ┆ California ┆ United States β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - + Query 8: - + MATCH (p1:Person)-[f:FOLLOWS]->(p2:Person) WHERE p1.personID > p2.personID RETURN count(f) as numFollowers - + Number of second degree connections reachable in the graph: shape: (1, 1) β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” @@ -229,7 +230,51 @@ shape: (1, 1) β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•‘ β”‚ 1214477 β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -Neo4j query script completed in 3.344930s + +Query 9: + + MATCH (:Person)-[r1:FOLLOWS]->(influencer:Person)-[r2:FOLLOWS]->(:Person) + WITH count(r1) AS numFollowers, influencer, r2 + WHERE influencer.age <= $age_upper AND numFollowers > 3000 + RETURN influencer.id AS influencerId, influencer.name AS name, count(r2) AS numFollows + ORDER BY numFollows DESC LIMIT 5; + + + Influencers below age 30 who follow the most people: +shape: (5, 3) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ influencerId ┆ name ┆ numFollows β”‚ +β”‚ --- ┆ --- ┆ --- β”‚ +β”‚ i64 ┆ str ┆ i64 β”‚ +β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════════════β•ͺ════════════║ +β”‚ 89758 ┆ Joshua Williams ┆ 40 β”‚ +β”‚ 85914 ┆ Micheal Holt ┆ 32 β”‚ +β”‚ 8077 ┆ Ralph Floyd ┆ 32 β”‚ +β”‚ 1348 ┆ Brett Wright ┆ 32 β”‚ +β”‚ 70809 ┆ David Cooper ┆ 31 β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + + +Query 10: + + MATCH (:Person)-[r1:FOLLOWS]->(influencer:Person)-[r2:FOLLOWS]->(person:Person) + WITH count(r1) AS numFollowers1, person, influencer, r2 + WHERE influencer.age >= $age_lower AND influencer.age <= $age_upper AND 
numFollowers1 > 3000 + RETURN count(r2) AS numFollowers2 + ORDER BY numFollowers2 DESC LIMIT 5; + + + Number of people followed by influencers in the age range 18-25: +shape: (1, 1) +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ numFollowers2 β”‚ +β”‚ --- β”‚ +β”‚ i64 β”‚ +β•žβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•‘ +β”‚ 690 β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + +Neo4j query script completed in 20.313278s ``` ### Query performance benchmark @@ -238,34 +283,34 @@ The benchmark is run using `pytest-benchmark` package as follows. ```sh $ pytest benchmark_query.py --benchmark-min-rounds=5 --benchmark-warmup-iterations=5 --benchmark-disable-gc --benchmark-sort=fullname -====================================================================================== test session starts ====================================================================================== +================================================= test session starts ================================================== platform darwin -- Python 3.11.2, pytest-7.4.0, pluggy-1.2.0 benchmark: 4.0.0 (defaults: timer=time.perf_counter disable_gc=True min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=5) rootdir: /code/kuzudb-study/neo4j plugins: Faker-19.2.0, anyio-3.7.1, benchmark-4.0.0 collected 10 items -benchmark_query.py .......... [100%] +benchmark_query.py .......... 
[100%] --------------------------------------------------------------------------------- benchmark: 10 tests --------------------------------------------------------------------------------- Name (time in s) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -test_benchmark_query1 1.7573 (380.15) 1.9808 (222.81) 1.8677 (331.94) 0.0888 (111.73) 1.8518 (342.74) 0.1397 (203.88) 2;0 0.5354 (0.00) 5 1 -test_benchmark_query10 8.9709 (>1000.0) 9.1463 (>1000.0) 9.0518 (>1000.0) 0.0727 (91.49) 9.0622 (>1000.0) 0.1210 (176.57) 2;0 0.1105 (0.00) 5 1 -test_benchmark_query2 0.6699 (144.90) 0.7536 (84.77) 0.7052 (125.34) 0.0326 (41.00) 0.6946 (128.57) 0.0456 (66.56) 2;0 1.4179 (0.01) 5 1 -test_benchmark_query3 0.0046 (1.0) 0.0089 (1.0) 0.0056 (1.0) 0.0008 (1.0) 0.0054 (1.0) 0.0008 (1.14) 19;6 177.7288 (1.0) 115 1 -test_benchmark_query4 0.0470 (10.16) 0.0714 (8.03) 0.0541 (9.62) 0.0071 (8.94) 0.0504 (9.33) 0.0097 (14.19) 3;0 18.4732 (0.10) 15 1 -test_benchmark_query5 0.0062 (1.35) 0.0118 (1.33) 0.0074 (1.31) 0.0010 (1.29) 0.0070 (1.30) 0.0007 (1.0) 11;8 135.4245 (0.76) 90 1 -test_benchmark_query6 0.0188 (4.06) 0.0395 (4.44) 0.0210 (3.74) 0.0033 (4.11) 0.0203 (3.75) 0.0015 (2.13) 1;5 47.5523 (0.27) 41 1 -test_benchmark_query7 0.1589 (34.38) 0.1659 (18.66) 0.1618 (28.76) 0.0022 (2.76) 0.1614 (29.87) 0.0021 (3.08) 2;0 6.1794 (0.03) 7 1 -test_benchmark_query8 0.8673 (187.61) 0.9557 (107.50) 0.9019 (160.30) 0.0330 (41.50) 0.8978 (166.17) 0.0344 (50.14) 2;0 1.1087 (0.01) 5 1 -test_benchmark_query9 7.0078 (>1000.0) 7.5807 (852.71) 7.1976 (>1000.0) 0.2214 (278.46) 7.1423 (>1000.0) 0.1712 (249.88) 1;1 0.1389 (0.00) 5 1 +test_benchmark_query1 1.7685 (252.71) 2.0853 (145.03) 1.8578 (221.68) 0.1292 (114.35) 1.8186 (224.85) 0.1059 (107.66) 1;1 0.5383 (0.00) 5 1 +test_benchmark_query10 8.6340 (>1000.0) 
8.9443 (622.06) 8.7908 (>1000.0) 0.1103 (97.55) 8.7834 (>1000.0) 0.0985 (100.09) 2;0 0.1138 (0.00) 5 1 +test_benchmark_query2 0.6305 (90.09) 0.6483 (45.09) 0.6384 (76.17) 0.0074 (6.57) 0.6386 (78.95) 0.0125 (12.73) 2;0 1.5665 (0.01) 5 1 +test_benchmark_query3 0.0380 (5.43) 0.0480 (3.34) 0.0405 (4.83) 0.0029 (2.52) 0.0395 (4.88) 0.0023 (2.37) 4;4 24.7145 (0.21) 22 1 +test_benchmark_query4 0.0419 (5.98) 0.0624 (4.34) 0.0471 (5.62) 0.0051 (4.54) 0.0453 (5.61) 0.0053 (5.34) 5;2 21.2382 (0.18) 23 1 +test_benchmark_query5 0.0070 (1.0) 0.0144 (1.0) 0.0084 (1.0) 0.0011 (1.0) 0.0081 (1.0) 0.0010 (1.0) 11;6 119.3207 (1.0) 87 1 +test_benchmark_query6 0.0200 (2.86) 0.0268 (1.86) 0.0218 (2.60) 0.0015 (1.32) 0.0214 (2.64) 0.0013 (1.36) 8;5 45.8986 (0.38) 42 1 +test_benchmark_query7 0.1595 (22.79) 0.1753 (12.19) 0.1634 (19.50) 0.0055 (4.85) 0.1613 (19.95) 0.0034 (3.45) 1;1 6.1194 (0.05) 7 1 +test_benchmark_query8 0.8537 (122.00) 0.8821 (61.35) 0.8726 (104.12) 0.0112 (9.92) 0.8737 (108.02) 0.0119 (12.06) 1;0 1.1460 (0.01) 5 1 +test_benchmark_query9 7.5164 (>1000.0) 8.3269 (579.12) 7.9377 (947.13) 0.3325 (294.18) 7.8846 (974.85) 0.5492 (558.27) 2;0 0.1260 (0.00) 5 1 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Legend: Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. 
OPS: Operations Per Second, computed as 1 / Mean -========================================================================= 10 passed in 144.49s (0:02:24) ========================================================================== +============================================ 10 passed in 147.13s (0:02:27) ============================================ ``` \ No newline at end of file diff --git a/neo4j/benchmark_query.py b/neo4j/benchmark_query.py index 542eac8..c5d7833 100644 --- a/neo4j/benchmark_query.py +++ b/neo4j/benchmark_query.py @@ -49,15 +49,15 @@ def test_benchmark_query2(benchmark, session): def test_benchmark_query3(benchmark, session): - result = benchmark(query.run_query3, session, "Canada") + result = benchmark(query.run_query3, session, "United States") result = result.to_dicts() assert len(result) == 5 - assert result[0]["city"] == "Montreal" - assert result[1]["city"] == "Calgary" - assert result[2]["city"] == "Toronto" - assert result[3]["city"] == "Edmonton" - assert result[4]["city"] == "Vancouver" + assert result[0]["city"] == "Louisville" + assert result[1]["city"] == "Denver" + assert result[2]["city"] == "San Francisco" + assert result[3]["city"] == "Tampa" + assert result[4]["city"] == "Nashville" def test_benchmark_query4(benchmark, session): diff --git a/neo4j/query.py b/neo4j/query.py index 90d5005..2297104 100644 --- a/neo4j/query.py +++ b/neo4j/query.py @@ -49,7 +49,8 @@ def run_query2(session: Session) -> None: def run_query3(session: Session, country: str) -> None: "Which 5 cities in a particular country have the lowest average age in the network?" 
query = """ - MATCH (p:Person) -[:LIVES_IN]-> (c:City) -[*1..2]-> (co:Country {country: $country}) + MATCH (p:Person) -[:LIVES_IN]-> (c:City) -[*1..2]-> (co:Country) + WHERE co.country = $country RETURN c.city AS city, avg(p.age) AS averageAge ORDER BY averageAge LIMIT 5 """ @@ -154,7 +155,7 @@ def run_query8(session: Session) -> None: def run_query9(session: Session, age_upper: int) -> None: - "Which 'influencers' (persons followed by more than 3K people) below a certain age in the network follow the most people?" + "Which 'influencers' (persons followed by more than 3K people) below a certain age follow the most people?" query = """ MATCH (:Person)-[r1:FOLLOWS]->(influencer:Person)-[r2:FOLLOWS]->(:Person) WITH count(r1) AS numFollowers, influencer, r2 @@ -175,7 +176,7 @@ def run_query9(session: Session, age_upper: int) -> None: def run_query10(session: Session, age_lower: int, age_upper: int) -> None: - "How many people in the network are followed by 'influencers' (people with > 3K followers) within a certain age range in the network?" + "How many people are followed by 'influencers' (people with > 3K followers) within a certain age range?" 
query = """ MATCH (:Person)-[r1:FOLLOWS]->(influencer:Person)-[r2:FOLLOWS]->(person:Person) WITH count(r1) AS numFollowers1, person, influencer, r2 @@ -189,7 +190,7 @@ def run_query10(session: Session, age_lower: int, age_upper: int) -> None: result = pl.from_dicts(response.data()) print( f""" - Number of people followed by people who follow influencers between the age of {age_lower}-{age_upper}:\n{result} + Number of people followed by influencers in the age range {age_lower}-{age_upper}:\n{result} """ ) return result @@ -202,7 +203,7 @@ def main() -> None: # fmt: off _ = run_query1(session) _ = run_query2(session) - _ = run_query3(session, country="Canada") + _ = run_query3(session, country="United States") _ = run_query4(session, age_lower=30, age_upper=40) _ = run_query5(session, gender="male", city="London", country="United Kingdom", interest="fine dining") _ = run_query6(session, gender="female", interest="tennis")