-
Notifications
You must be signed in to change notification settings - Fork 0
/
smallDatasetsGenerator.py
136 lines (115 loc) · 5.86 KB
/
smallDatasetsGenerator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import pandas as pd
import random
import os
import pickle
from tqdm import tqdm
from trainingTablesPreprocessing import compute_table_ids
def generate_small_triple_dataset(n: int,
                                  triple_file: str="/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_no_small_tables.csv"
                                  ) -> pd.DataFrame:
    """Build a smaller dataset by randomly sampling rows of a larger one.

    Args:
        n (int): number of rows of the new dataset (capped at the source size)
        triple_file (str, optional): path to the larger dataset (.csv). Defaults to "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_no_small_tables.csv".

    Returns:
        pd.DataFrame: the new small dataset, re-indexed from 0
    """
    source = pd.read_csv(triple_file)
    random.seed(42)  # fixed seed: the same subset is drawn on every run
    sample_size = min(n, source.shape[0])
    chosen_rows = random.sample(range(source.shape[0]), sample_size)
    return source.iloc[chosen_rows].reset_index(drop=True)
def create_directory(directory_path) -> None:
    """Create a new directory, including any missing parent directories.

    Best-effort by design: an already-existing directory is silently
    ignored, and any other failure is reported on stdout instead of being
    raised, so callers never have to handle exceptions.

    Args:
        directory_path (str): path of the new directory
    """
    try:
        # makedirs (vs. mkdir) also creates missing intermediate directories,
        # so a deep output path no longer fails with FileNotFoundError.
        os.makedirs(directory_path)
        print("New directory created")
    except FileExistsError:
        pass  # directory is already there: nothing to do
    except Exception as e:
        print(f"An error occurred: {str(e)}")
def generate_small_graph_dataset(full_graph_dataset: dict, indexes: set) -> dict:
    """Provided the indexes of the tables in the smaller dataset, build a smaller graph dictionary.

    Args:
        full_graph_dataset (dict): dictionary containing all the graphs, keyed by string id
        indexes (set): indexes of the subset of necessary graphs

    Returns:
        dict: a new graph dictionary containing only the desired subset of graphs

    Raises:
        KeyError: if an index is present in neither its plain nor its int-normalized
            string form (the original bare ``except`` hid this and every other error).
    """
    out = {}
    for k in tqdm(indexes):
        key = str(k)
        try:
            out[key] = full_graph_dataset[key]
        except KeyError:
            # Indexes may arrive as floats (e.g. 3.0 read back from a CSV);
            # retry with the int-normalized key ("3.0" -> "3").
            int_key = str(int(k))
            out[int_key] = full_graph_dataset[int_key]
    return out
def load_small_dataset(dir: str) -> dict:
    """Load the files of a small dataset stored in the provided directory.

    Args:
        dir (str): directory containing 'samples.csv' and 'graphs.pkl'

    Returns:
        dict: a dictionary containing 'triples' (DataFrame) and 'graphs' (dict)
    """
    triples = pd.read_csv(dir + '/samples.csv')
    with open(dir + '/graphs.pkl', 'rb') as fh:
        graphs = pickle.load(fh)
    return {'triples': triples, 'graphs': graphs}
def generate_small_datasets(length_list: list,
                            triple_file: str="/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_no_small_tables.csv",
                            graph_file: str="/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_graphs_dict_with_id.pkl",
                            out_dir: str="/dati/home/francesco.pugnaloni/wikipedia_tables/small_tables"
                            ) -> None:
    """Generate many small datasets from a large one, one subdirectory each.

    For every requested length a subdirectory '<n>_samples' is created under
    out_dir, holding a sampled triple file, the matching table-id file and a
    pickled subset of the graph dictionary.

    Args:
        length_list (list): list of the lengths of the datasets to generate
        triple_file (str, optional): path to the large triple file. Defaults to "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/test_samples_no_small_tables.csv".
        graph_file (str, optional): path to the large graph_file. Defaults to "/dati/home/francesco.pugnaloni/wikipedia_tables/processed_tables/full_graphs_dict_with_id.pkl".
        out_dir (str, optional): path to the directory where to save the new datasets. Defaults to "/dati/home/francesco.pugnaloni/wikipedia_tables/small_tables".
    """
    print('Loading graph dictionary, it will take some time (~9/10 minutes)')
    with open(graph_file, 'rb') as fh:
        graph_dict = pickle.load(fh)
    create_directory(out_dir)
    for n in length_list:
        dataset_dir = f"{out_dir}/{str(n)}_samples"
        create_directory(dataset_dir)
        print(f'Generating new dataset of length {n}_____________________')
        triples = generate_small_triple_dataset(n, triple_file)
        triples.to_csv(f"{dataset_dir}/samples.csv", index=False)
        # compute_table_ids reads the freshly written samples back and returns
        # the set of table ids needed to subset the graph dictionary
        indexes = compute_table_ids(f"{dataset_dir}/samples.csv", f"{dataset_dir}/ids")
        print('Small graph generation starts')
        small_graphs = generate_small_graph_dataset(graph_dict, indexes)
        with open(f"{dataset_dir}/graphs.pkl", 'wb') as fh:
            pickle.dump(small_graphs, fh)
    print(f'{len(length_list)} new dataset successfully generated in the desired directory')
def subsample_large_dataset(path_in: str, dir_out: str, n_sample: int=10000, n_split: int=5, seed: int=42) -> None:
    """Shuffle a dataset and write it out as n_split consecutive, disjoint splits.

    Each split i is saved to '<dir_out>/<i>/samples.csv'.

    Bug fixes vs. the original implementation:
    * the shuffle now actually honors ``seed``: pandas' ``DataFrame.sample``
      draws from numpy's RNG, so the previous ``random.seed(seed)`` call had
      no effect — ``random_state=seed`` is what makes it deterministic;
    * the slice offset is advanced on every iteration (``last_index`` was
      never updated before, so all n_split files contained the SAME rows).

    Args:
        path_in (str): path to the input dataset
        dir_out (str): path to the output directory
        n_sample (int, optional): number of samples per split. Defaults to 10000.
        n_split (int, optional): number of splits. Defaults to 5.
        seed (int, optional): for randomization. Defaults to 42.
    """
    df = pd.read_csv(path_in)
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    for i in range(n_split):
        split_dir = dir_out + "/" + str(i)
        # exist_ok + parent creation: works even when dir_out does not exist yet
        os.makedirs(split_dir, exist_ok=True)
        start = i * n_sample
        df.iloc[start:start + n_sample].to_csv(split_dir + "/samples.csv", index=False)
if __name__ == '__main__':
    # Script entry point: the commented-out calls below are previous runs kept
    # as usage examples for generate_small_datasets (debug paths and the full
    # set of dataset sizes 10^1..10^5).
    # generate_small_datasets([10**1,10**2,10**3,10**4,10**5],
    #                         graph_file="/dati/home/francesco.pugnaloni/wikipedia_tables/small_dataset_debug/graphs.pkl",
    #                         triple_file="/dati/home/francesco.pugnaloni/wikipedia_tables/small_dataset_debug/triples.csv"
    #                         )
    # generate_small_datasets([10**1,10**2,10**3,10**4,10**5])
    # Current run: split one large sample file into the default 5 splits of
    # 10000 rows each (paths are machine-specific absolute paths).
    subsample_large_dataset("/dati/home/francesco.pugnaloni/wikipedia_tables/training_data/millions_of_tables/samples.csv",
                            "/home/francesco.pugnaloni/wikipedia_tables/dataset_test_lukas")
    pass