-
Notifications
You must be signed in to change notification settings - Fork 0
/
otp_graph_import.py
249 lines (215 loc) · 11.5 KB
/
otp_graph_import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
from typing import List, Set, Dict, Tuple
import sys
sys.path.append('..')
from shapely.geometry import Point, LineString
import numpy as np
import pandas as pd
import geopandas as gpd
import igraph as ig
import shapely.wkt
from pyproj import CRS
from common.igraph import Node, Edge
import common.igraph as ig_utils
import common.geometry as geom_utils
from common.logger import Logger
def convert_otp_graph_to_igraph(
    node_csv_file: str,
    edge_csv_file: str,
    hma_poly_file: str,
    igraph_out_file: str,
    b_export_otp_data_to_gpkg: bool = False,
    b_export_decomposed_igraphs_to_gpkg: bool = False,
    b_export_final_graph_to_gpkg: bool = False,
    debug_otp_graph_gpkg: str = 'debug/otp_graph_features.gpkg',
    debug_igraph_gpkg: str = 'debug/otp2igraph_features.gpkg',
    log: Logger = None
) -> ig.Graph:
    """Builds a directed igraph street network from OTP graph data.

    Reads OTP nodes and edges from CSV files (WKT geometries in WGS84),
    reprojects them to ETRS-GK25 (EPSG:3879), filters out edges unsuitable
    for walking and cycling, drops edges outside the (buffered) Helsinki
    Metropolitan Area polygon, removes small disconnected subgraphs and
    isolated nodes, and finally exports the graph to a GraphML file.

    Args:
        node_csv_file: Path to the OTP node CSV (';'-separated, WKT geometry).
        edge_csv_file: Path to the OTP edge CSV (';'-separated, WKT geometry).
        hma_poly_file: Path to a polygon file readable by geopandas that
            delineates the Helsinki Metropolitan Area (HMA).
        igraph_out_file: Path of the GraphML file to write; skipped if empty/None.
        b_export_otp_data_to_gpkg: Export raw OTP nodes/edges to a debug GPKG.
        b_export_decomposed_igraphs_to_gpkg: Export decomposed subgraph edges
            to a debug GPKG.
        b_export_final_graph_to_gpkg: Export the final graph to a debug GPKG.
        debug_otp_graph_gpkg: Path of the debug GPKG for raw OTP data.
        debug_igraph_gpkg: Path of the debug GPKG for igraph data.
        log: Optional Logger; a printing logger is created if omitted.
            (Default is None instead of a Logger instance so the default is not
            a single shared object created at function-definition time.)

    Returns:
        The final igraph graph (note: an ig.Graph, not a dict).
    """
    if log is None:
        log = Logger(printing=True)

    def check_edge_ids(note: str = ''):
        # log how many edges have an id_ig attribute that no longer matches
        # their current igraph index (indexes shift after deletions)
        mismatch_count = len([edge.index for edge in G.es if edge.attributes()[Edge.id_ig.value] != edge.index])
        log.info(f'invalid edge ids: {mismatch_count}{note}')

    def reassign_ig_ids():
        # store current igraph indexes back to the id_ig attributes
        G.es[Edge.id_ig.value] = [e.index for e in G.es]
        G.vs[Node.id_ig.value] = [v.index for v in G.vs]

    def collect_subgraph_edges(graphs: list) -> list:
        # collect edge dicts from each subgraph, tagging every edge with the
        # (enumeration) id of the subgraph it belongs to
        collected = []
        for graph_id, graph in enumerate(graphs):
            edges = ig_utils.get_edge_dicts(graph, attrs=[Edge.id_otp, Edge.id_ig, Edge.geometry])
            for edge in edges:
                edge['graph_id'] = graph_id
            collected.extend(edges)
        return collected

    hma_poly = geom_utils.project_geom(gpd.read_file(hma_poly_file)['geometry'][0])

    # 1) read nodes from CSV
    n = pd.read_csv(node_csv_file, sep=';')
    log.info(f'read {len(n.index)} nodes')
    log.debug(f'node column types: {n.dtypes}')
    log.debug(f'nodes head: {n.head()}')
    log.info('creating node gdf')
    # parse WKT geometries; fall back to an empty Point for missing values
    n[Node.geometry.name] = [shapely.wkt.loads(geom) if isinstance(geom, str) else Point() for geom in n[Node.geometry.name]]
    n[Node.geom_wgs.name] = n[Node.geometry.name]
    n = gpd.GeoDataFrame(n, geometry=Node.geometry.name, crs=CRS.from_epsg(4326))
    log.info('reprojecting nodes to etrs')
    n = n.to_crs(epsg=3879)
    log.debug(f'nodes head: {n.head()}')

    # 2) read edges from CSV
    e = pd.read_csv(edge_csv_file, sep=';')
    log.info(f'read {len(e.index)} edges')
    log.debug(f'edge column types: {e.dtypes}')
    log.debug(f'edges head: {e.head()}')
    log.info('creating edge gdf')
    # parse WKT geometries; fall back to an empty LineString for missing values
    e[Edge.geometry.name] = [shapely.wkt.loads(geom) if isinstance(geom, str) else LineString() for geom in e[Edge.geometry.name]]
    e[Edge.geom_wgs.name] = e[Edge.geometry.name]
    e = gpd.GeoDataFrame(e, geometry=Edge.geometry.name, crs=CRS.from_epsg(4326))
    log.info('reprojecting edges to etrs')
    e = e.to_crs(epsg=3879)
    log.debug(f'edges head: {e.head()}')

    # 3) export graph data to gpkg
    if b_export_otp_data_to_gpkg:
        log.info('writing otp graph data to gpkg')
        e.drop(columns=[Edge.geom_wgs.name]).to_file(debug_otp_graph_gpkg, layer='edges', driver='GPKG')
        log.info(f'exported edges to {debug_otp_graph_gpkg} (layer=edges)')
        # fix: drop the *node* WGS geometry column (was Edge.geom_wgs.name)
        n.drop(columns=[Node.geom_wgs.name]).to_file(debug_otp_graph_gpkg, layer='nodes', driver='GPKG')
        log.info(f'exported nodes to {debug_otp_graph_gpkg} (layer=nodes)')

    # 4) filter out edges that are unsuitable for both walking and cycling
    def filter_df_by_query(df: pd.DataFrame, query: str, name: str = 'rows') -> pd.DataFrame:
        # return a copy of df filtered by the query, logging the filter ratio
        count_before = len(df.index)
        df_filt = df.query(query).copy()
        filt_ratio = (count_before - len(df_filt.index)) / count_before
        log.info(f'filtered out {count_before-len(df_filt.index)} {name} ({round(filt_ratio * 100, 1)} %) by {query}')
        return df_filt

    e_filt = filter_df_by_query(e, f'{Edge.allows_walking.name} == True or {Edge.allows_biking.name} == True', name='edges')
    e_filt = filter_df_by_query(e_filt, f'{Edge.is_no_thru_traffic.name} == False', name='edges')

    # 5) create dictionaries for converting otp ids to ig ids and vice versa
    log.debug('create maps for converting otp ids to ig ids')
    n[Node.id_ig.name] = np.arange(len(n.index))
    ids_otp_ig = {}
    ids_ig_otp = {}
    for node in n.itertuples():
        ids_otp_ig[getattr(node, Node.id_otp.name)] = getattr(node, Node.id_ig.name)
        ids_ig_otp[getattr(node, Node.id_ig.name)] = getattr(node, Node.id_otp.name)

    # 6) add nodes to graph
    log.info('adding nodes to graph')
    G = ig.Graph(directed=True)
    G.add_vertices(len(n.index))
    for attr in Node:
        if attr.name in n.columns:
            G.vs[attr.value] = list(n[attr.name])
        else:
            log.warning(f'node column {attr.name} not present in dataframe')

    # 7) add edges to graph
    log.info('adding edges to graph')
    # get edge lengths by projected geometry
    e_filt[Edge.length.name] = [round(geom.length, 4) if isinstance(geom, LineString) else 0.0 for geom in e_filt[Edge.geometry.name]]

    def get_ig_uv(edge) -> Tuple[int, int]:
        # map an edge's otp node ids to igraph vertex ids
        return (ids_otp_ig[edge['node_orig_id']], ids_otp_ig[edge['node_dest_id']])

    e_filt['uv_ig'] = e_filt.apply(lambda row: get_ig_uv(row), axis=1)
    e_filt[Edge.id_ig.name] = np.arange(len(e_filt.index))
    G.add_edges(list(e_filt['uv_ig']))
    for attr in Edge:
        if attr.name in e_filt.columns:
            G.es[attr.value] = list(e_filt[attr.name])
        else:
            log.warning(f'edge column {attr.name} not present in dataframe')

    # 8) delete edges outside Helsinki Metropolitan Area (HMA)
    hma_buffered = hma_poly.buffer(100)

    def intersects_hma(geom: LineString) -> bool:
        # empty geometries are kept (treated as intersecting)
        if geom.is_empty:
            return True
        return geom.intersects(hma_buffered)

    e_gdf = ig_utils.get_edge_gdf(G)
    log.info('finding edges that intersect with HMA')
    e_gdf['in_hma'] = [intersects_hma(line) for line in e_gdf[Edge.geometry.name]]
    e_gdf_del = e_gdf.query('in_hma == False').copy()
    out_ratio = round(100 * len(e_gdf_del.index)/len(e_gdf.index), 1)
    log.info(f'found {len(e_gdf_del.index)} ({out_ratio} %) edges outside HMA')
    log.info('deleting edges')
    before_count = G.ecount()
    G.delete_edges(e_gdf_del.index.tolist())
    after_count = G.ecount()
    log.info(f'deleted {before_count-after_count} edges')
    # check if id_ig:s need to be updated to edge attributes
    check_edge_ids()
    # reassign igraph indexes to edge and node attributes
    reassign_ig_ids()
    check_edge_ids(' (after re-indexing)')

    # 9) find and inspect subgraphs by decomposing the graph
    sub_graphs = G.decompose(mode='STRONG')
    log.info(f'found {len(sub_graphs)} subgraphs')
    graph_sizes = [graph.ecount() for graph in sub_graphs]
    for size_limit in (10, 50, 100, 500, 10000):
        log.info(f'subgraphs with more than {size_limit} edges: {len([s for s in graph_sizes if s > size_limit])}')
    small_graphs = [graph for graph in sub_graphs if graph.ecount() <= 15]
    medium_graphs = [graph for graph in sub_graphs if (graph.ecount() > 15 and graph.ecount() <= 500)]
    big_graphs = [graph for graph in sub_graphs if graph.ecount() > 500]
    small_graph_edges = collect_subgraph_edges(small_graphs)
    medium_graph_edges = collect_subgraph_edges(medium_graphs)
    big_graph_edges = collect_subgraph_edges(big_graphs)
    if b_export_decomposed_igraphs_to_gpkg:
        log.info('exporting subgraphs to gpkg')
        # graphs with <= 15 edges
        small_graph_edges_gdf = gpd.GeoDataFrame(small_graph_edges, crs=CRS.from_epsg(3879))
        small_graph_edges_gdf.to_file(debug_igraph_gpkg, layer='small_graph_edges', driver='GPKG')
        # graphs with 15–500 edges
        medium_graph_edges_gdf = gpd.GeoDataFrame(medium_graph_edges, crs=CRS.from_epsg(3879))
        medium_graph_edges_gdf.to_file(debug_igraph_gpkg, layer='medium_graph_edges', driver='GPKG')
        # graphs with > 500 edges
        big_graph_edges_gdf = gpd.GeoDataFrame(big_graph_edges, crs=CRS.from_epsg(3879))
        big_graph_edges_gdf.to_file(debug_igraph_gpkg, layer='big_graph_edges', driver='GPKG')
        log.info('graphs exported')

    # 10) delete smallest subgraphs from the graph
    del_edge_ids = [edge[Edge.id_ig.name] for edge in small_graph_edges]
    log.info(f'deleting {len(del_edge_ids)} isolated edges')
    before_count = G.ecount()
    G.delete_edges(del_edge_ids)
    after_count = G.ecount()
    del_ratio = round(100 * (before_count-after_count) / before_count, 1)
    log.info(f'deleted {before_count-after_count} ({del_ratio} %) edges')

    # 11) delete isolated nodes from the graph
    del_node_ids = [v.index for v in G.vs.select(_degree_eq=0)]
    log.info(f'deleting {len(del_node_ids)} isolated nodes')
    before_count = G.vcount()
    G.delete_vertices(del_node_ids)
    after_count = G.vcount()
    del_ratio = round(100 * (before_count-after_count) / before_count, 1)
    log.info(f'deleted {before_count-after_count} ({del_ratio} %) nodes')
    # check if id_ig:s need to be updated to edge attributes
    check_edge_ids()
    # reassign igraph indexes to edge and node attributes
    reassign_ig_ids()
    check_edge_ids(' (after re-indexing)')

    # 12) export graph data to GeoDataFrames for debugging
    if b_export_final_graph_to_gpkg:
        log.info(f'exporting final graph to {debug_igraph_gpkg} for debugging')
        e_gdf = ig_utils.get_edge_gdf(G, attrs=[Edge.id_otp, Edge.id_ig], ig_attrs=['source', 'target'])
        n_gdf = ig_utils.get_node_gdf(G, ig_attrs=['index'])
        e_gdf.to_file(debug_igraph_gpkg, layer='final_graph_edges', driver='GPKG')
        n_gdf.to_file(debug_igraph_gpkg, layer='final_graph_nodes', driver='GPKG')

    if igraph_out_file:
        ig_utils.export_to_graphml(G, igraph_out_file)

    return G
if __name__ == '__main__':
    # Build the HMA igraph from OTP graph data with default debug exports off.
    log = Logger(printing=True, log_file='otp_graph_import.log', level='info')
    graph = convert_otp_graph_to_igraph(
        node_csv_file='otp_graph_data/otp_nodes.csv',
        edge_csv_file='otp_graph_data/otp_edges.csv',
        hma_poly_file='extent_data/HMA.geojson',
        igraph_out_file='hma.graphml',
        b_export_otp_data_to_gpkg=False,
        b_export_decomposed_igraphs_to_gpkg=False,
        b_export_final_graph_to_gpkg=False,
        debug_otp_graph_gpkg='debug/otp_graph_features.gpkg',
        debug_igraph_gpkg='debug/otp2igraph_features.gpkg',
        log=log,
    )
    log.info(f'created igraph of {graph.ecount()} edges and {graph.vcount()} nodes from OTP data')