build_dataset.py
import os
import requests
import tarfile
from tqdm import tqdm
from utils import read_jsonl
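
# `read_jsonl` comes from the repository's own utils module; its exact
# implementation is not shown here. A minimal stand-in with the behaviour this
# script relies on (one JSON object per line, returned as a list of dicts)
# would look roughly like:
#
#   import json
#   def read_jsonl(path):
#       with open(path) as f:
#           return [json.loads(line) for line in f if line.strip()]
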
cur_dir = os.path.dirname(__file__)
out_dir = os.path.join(cur_dir, "resources")
dataset_links = read_jsonl(os.path.join(cur_dir, "dataset_links.jsonl"))
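
# Each record in dataset_links.jsonl is expected to carry the fields accessed
# below ("name", "subdir", "base_url", "urls"); the concrete values in this
# example are illustrative only:
#
#   {"name": "tamperednews_wordembeddings",
#    "subdir": "wordembeddings",
#    "base_url": "https://example.org/datasets/",
#    "urls": ["word_embeddings_nouns.tar.gz"]}
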
"""
Download files
"""
for x in dataset_links:
for url in x["urls"]:
fname = url.split("/")[-1]
out_path = os.path.join(out_dir, x["subdir"], fname)
# create output directory
if not os.path.exists(os.path.dirname(out_path)):
os.makedirs(os.path.dirname(out_path))
# check if file already exists
if os.path.isfile(out_path) or os.path.isfile(os.path.splitext(out_path)[0]):
print(f"{fname} already exists.")
continue
# download file
try:
r = requests.get(x["base_url"] + url, stream=True)
total_size = int(r.headers.get("content-length", 0))
block_size = 1024
t = tqdm(total=total_size, unit="iB", unit_scale=True, desc="Downloading " + fname)
with open(out_path, "wb") as f:
for data in r.iter_content(block_size):
t.update(len(data))
f.write(data)
t.close()
except KeyboardInterrupt:
if os.path.isfile(out_path):
os.remove(out_path)
print(f"Error downloading: {fname}")
except:
if os.path.isfile(out_path):
os.remove(out_path)
print(f"Error downloading: {fname}")
"""
Untar files
"""
for x in dataset_links:
fpath = os.path.join(out_dir, x["subdir"], x["urls"][0].split("/")[-1])
if fpath.endswith("bin.gz"): # fastText models
if os.path.isfile(os.path.splitext(fpath)[0]):
print(f"{os.path.splitext(fpath)[0]} already exists")
continue
print(f"gunzip {fpath}")
os.system(f"gunzip {fpath}")
continue
# tar.gz
if len(x["urls"]) > 1:
partname, _ = os.path.splitext(fpath)
if not os.path.exists(partname):
print(f"Combine parts of {partname}")
os.system(f"cat {partname}* > {partname}")
fpath = partname
print(f"Untar {fpath}")
tf = tarfile.open(fpath)
tf.extractall(path=os.path.dirname(fpath))
# move downloaded file from old icmr'20 dataset to new ijmir'21 folder structure
if x["name"] == "tamperednews_wordembeddings":
src_name = os.path.join(out_dir, "wordembeddings", "word_embeddings_nouns.h5")
dst_name = os.path.join(out_dir, "features", "tamperednews_noun_embeddings.h5")
print(src_name)
if os.path.isdir(os.path.dirname(src_name)):
if os.path.isfile(src_name):
print(f"mv {src_name} {dst_name}")
os.system(f"mv {src_name} {dst_name}")
print(f"rm -r {os.path.dirname(src_name)}")
os.system(f"rm -r {os.path.dirname(src_name)}")