-
Notifications
You must be signed in to change notification settings - Fork 2
/
Data_preparation.py
79 lines (67 loc) · 2.67 KB
/
Data_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
from psycopg2 import sql
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import sys
def get_metadata(path="books"):
    """Collect header metadata and full text for every ``.txt`` file in *path*.

    Each file is read once. Its lines are then scanned for header fields: a
    line whose first whitespace-separated token is ``Title:``, ``Author:`` or
    ``Language:`` supplies that field, and the remaining tokens (joined with
    single spaces) become the value. A field that never appears is stored as
    an empty string.

    Args:
        path: Directory to scan. Defaults to ``"books"`` (relative to the
            current working directory), which is what the original
            zero-argument call used.

    Returns:
        A ``(df_books, stories)`` tuple of pandas DataFrames with one row per
        ``.txt`` file, in sorted file-name order:

        * ``df_books`` has columns ``bookno`` (the file name), ``title``,
          ``author`` and ``lang``.
        * ``stories`` has columns ``bookno`` and ``content`` (the complete
          file text).

    Raises:
        FileNotFoundError: If *path* does not exist (raised by ``os.listdir``).
    """
    # Header label (first token on the line) -> output column name.
    field_columns = {"Title:": "title", "Author:": "author", "Language:": "lang"}

    records = []
    contents = []
    # os.listdir() order is arbitrary; sort so the row order is reproducible.
    for name in sorted(os.listdir(path)):
        if not name.endswith(".txt"):
            continue

        # Plain read mode is enough; "r+" would needlessly require write
        # permission on the book files.
        with open(os.path.join(path, name), "r", errors="ignore") as f:
            text = f.read()

        record = {"bookno": name, "title": "", "author": "", "lang": ""}
        found = set()
        # Scan the text that was already read. Iterating the file object
        # after f.read() yields nothing, because the position is at EOF.
        # Text mode has already normalised line endings to "\n".
        for line in text.split("\n"):
            tokens = line.split()
            if not tokens:
                continue
            column = field_columns.get(tokens[0])
            # First occurrence wins, so a later body line that happens to
            # start with the same label cannot overwrite the header value.
            if column is None or column in found:
                continue
            # Dropping the label token avoids str.lstrip("Title:"), which
            # strips a *set of characters* rather than a prefix.
            record[column] = " ".join(tokens[1:])
            found.add(column)
            if len(found) == len(field_columns):
                break

        records.append(record)
        contents.append(text)

    df_books = pd.DataFrame(records, columns=["bookno", "title", "author", "lang"])
    stories = pd.DataFrame(
        {
            "bookno": [record["bookno"] for record in records],
            "content": contents,
        }
    )
    return df_books, stories
def push_metadata_todb(user_name, password, db_name):
    """Create the target database if needed and upload the book tables to it.

    The book data comes from ``get_metadata()``. The function connects to the
    ``postgres`` maintenance database with the supplied credentials, creates
    *db_name* when it does not exist yet, then appends the metadata to the
    ``metadata`` table and the full texts to the ``short_stories`` table.

    Args:
        user_name: PostgreSQL role used for both the maintenance connection
            and the upload connection.
        password: Password for *user_name*.
        db_name: Name of the database that receives the tables.

    Returns:
        The string ``"Data uploaded to DB"`` once both tables are written.

    Raises:
        psycopg2.Error: If connecting or creating the database fails.
        sqlalchemy.exc.SQLAlchemyError: If writing either table fails.
    """
    df_books, stories = get_metadata()

    con = psycopg2.connect(
        dbname="postgres", user=user_name, host="", password=password
    )
    try:
        # CREATE DATABASE cannot run inside a transaction block, so the
        # connection has to be in autocommit mode.
        con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        with con.cursor() as cur:
            # The tables are written with if_exists="append", so reruns are
            # expected; only create the database on the first run instead of
            # failing because it already exists.
            cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", (db_name,))
            if cur.fetchone() is None:
                cur.execute(
                    sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name))
                )
    finally:
        # Always release the maintenance connection, even if creation failed.
        con.close()

    def _connect():
        # Same credentials and socket settings as the maintenance connection.
        # A bare "postgresql:///<db>" URL would ignore user_name and password
        # and fall back to the driver defaults.
        return psycopg2.connect(
            dbname=db_name, user=user_name, host="", password=password
        )

    engine = create_engine("postgresql+psycopg2://", creator=_connect)
    try:
        df_books.to_sql("metadata", engine, if_exists="append", index=False)
        stories.to_sql("short_stories", engine, if_exists="append", index=False)
    finally:
        # Close the pooled connections held by the engine.
        engine.dispose()
    return "Data uploaded to DB"
if __name__ == "__main__":
    # Expected invocation: <script> <user_name> <password> <db_name>
    # Fail with a usage message instead of a bare IndexError when arguments
    # are missing. Extra arguments are ignored, as before.
    if len(sys.argv) < 4:
        sys.exit(f"usage: {sys.argv[0]} <user_name> <password> <db_name>")
    push_metadata_todb(sys.argv[1], sys.argv[2], sys.argv[3])
# import psycopg2
# con = psycopg2.connect(dbname="postgres", user="shubyog", host="", password=test)
# con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
# cur = con.cursor()
# cur.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier("test")))