Skip to content

Commit

Permalink
making dont merge an option for graph spec merge strategy
Browse files Browse the repository at this point in the history
  • Loading branch information
EvanDietzMorris committed Dec 19, 2024
1 parent 12edddc commit c840af7
Showing 1 changed file with 15 additions and 10 deletions.
25 changes: 15 additions & 10 deletions Common/kgx_file_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,11 @@ def merge(self,
for graph_source in chain(graph_spec.sources, graph_spec.subgraphs):
if graph_source.merge_strategy == 'default':
primary_sources.append(graph_source)
elif graph_source.merge_strategy == 'dont_merge_edges':
primary_sources.append(graph_source)
elif graph_source.merge_strategy == 'connected_edge_subset':
secondary_sources.append(graph_source)

# TODO we should be able to process a single primary source more efficiently (ie copy and paste it)
# if len(primary_sources) == 1:
# self.process_single_source(primary_sources[0], nodes_output_file_path, edges_output_file_path)
# else:

merge_metadata = {
'sources': {},
'final_node_count': 0,
Expand Down Expand Up @@ -110,10 +107,18 @@ def merge_primary_sources(self,
merge_metadata["sources"][graph_source.id][source_filename]["nodes"] = nodes_count

elif "edges" in file_path:
with jsonlines.open(file_path) as edges:
edges_count = graph_merger.merge_edges(edges)
merge_metadata["sources"][graph_source.id][source_filename]["edges"] = edges_count

if graph_source.merge_strategy == "default":
with jsonlines.open(file_path) as edges:
edges_count = graph_merger.merge_edges(edges)
merge_metadata["sources"][graph_source.id][source_filename]["edges"] = edges_count
elif graph_source.merge_strategy == "dont_merge_edges":
# just copy the lines from the data source edges file verbatim without merging
with jsonlines.open(file_path) as edges, open(edges_out_file, 'a') as edges_out:
edges_count = 0
for edge in edges:
edges_out.write(edge)
edges_count += 1
merge_metadata["sources"][graph_source.id][source_filename]["edges"] = edges_count
else:
raise ValueError(f"Did not recognize file {file_path} for merging "
f"from data source {graph_source.id}.")
Expand Down Expand Up @@ -177,7 +182,7 @@ def __write_back_to_file(self,

self.logger.debug(f'Writing merged edges to file...')
edges_written = 0
with open(edges_out_file, 'w') as edges_out:
with open(edges_out_file, 'a') as edges_out:
for edge_line in graph_merger.get_merged_edges_jsonl():
edges_out.write(edge_line)
edges_written += 1
Expand Down

0 comments on commit c840af7

Please sign in to comment.