-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
142 lines (112 loc) · 4.58 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from os import environ
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from pymongo.mongo_client import MongoClient
import utils
from datetime import datetime
from dotenv import load_dotenv
import requests
# Module-level setup. Everything below runs at import time, in this order.

# Load variables from a .env file into the process environment so the
# environ lookup further down can see them.
load_dotenv()
# Path of the service-account key file (relative, so it depends on the
# current working directory).
SERVICE_ACCOUNT_FILE = 'credentials.json'
# Only the read-only Sheets scope is requested.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
# Reads the key file and builds the Sheets v4 client shared by the functions
# in this module.
credentials = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
service = build('sheets', 'v4', credentials=credentials)
# environ.get returns None when SPREADSHEET_ID is not set; nothing here
# validates it.
SPREADSHEET_ID = environ.get('SPREADSHEET_ID')
# Short key -> tab title. Within this file only the values are used
# (scrape_google_sheet iterates SHEET_NAMES.values() as sheet names).
# NOTE(review): 'extremededemon' looks like a typo for 'extremedemon' —
# confirm no caller relies on the current spelling before renaming.
SHEET_NAMES = {
'easy': 'Easy',
'normal': 'Normal',
'hard': 'Hard',
'harder': 'Harder',
'insane': 'Insane',
'easydemon': 'Easy Demon',
'mediumdemon': 'Medium Demon',
'harddemon': 'Hard Demon',
'insanedemon': 'Insane Demon',
'extremededemon': 'Extreme Demon'
}
async def calculate_range(sheet_name, column):
    """Return the sheet row number of the last row reported for ``column``.

    Requests ``<sheet_name>!<column>2:<column>`` (the column from row 2
    downwards) and counts the rows that come back. The request is executed
    synchronously; nothing is awaited inside this coroutine.

    Args:
        sheet_name: Title of the tab to inspect.
        column: Column letter, e.g. ``'A'``.

    Returns:
        The number of returned rows plus one.
    """
    column_range = f'{sheet_name}!{column}2:{column}'
    request = service.spreadsheets().values().get(
        spreadsheetId=SPREADSHEET_ID,
        range=column_range,
    )
    response = request.execute()
    row_count = len(response.get('values', []))
    # The range starts at row 2, so N returned rows end at sheet row N + 1.
    return row_count + 1
async def get_sheet_id(sheet_name):
    """Look up the ``sheetId`` of the tab titled ``sheet_name``.

    Downloads the spreadsheet metadata and scans its ``sheets`` entries. The
    request is executed synchronously; nothing is awaited inside this
    coroutine.

    Args:
        sheet_name: Tab title to search for (exact match).

    Returns:
        The ``sheetId`` property of the first tab whose title matches, or
        ``None`` when no tab has that title.
    """
    metadata = service.spreadsheets().get(spreadsheetId=SPREADSHEET_ID).execute()
    all_properties = (
        entry.get('properties', {}) for entry in metadata.get('sheets', [])
    )
    matching_ids = (
        properties.get('sheetId')
        for properties in all_properties
        if properties.get('title') == sheet_name
    )
    # next() stops at the first match and falls back to None otherwise.
    return next(matching_ids, None)
async def scrape_google_sheet(client: MongoClient):
    """Mirror the spreadsheet tabs listed in ``SHEET_NAMES`` into MongoDB.

    For every tab the rows ``C2:F<last_row>`` are read (``last_row`` comes
    from ``calculate_range`` on column ``A``), a link to the row is inserted
    at index 3 of each row, and the combined rows are handed to
    ``utils.parse_data``. Each parsed entry is upserted into
    ``secretways.levels`` under its integer ``_id``; entries that were not in
    the collection yet also receive a ``timestamp``. Finally every document
    whose ``_id`` was not produced by this run is deleted.

    Args:
        client: MongoDB client used to reach ``secretways.levels``.
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    spreadsheets = service.spreadsheets()

    # Download the spreadsheet metadata once and index tab ids by title
    # instead of re-fetching the same metadata for every tab.
    metadata = spreadsheets.get(spreadsheetId=SPREADSHEET_ID).execute()
    sheet_ids = {}
    for sheet in metadata.get('sheets', []):
        properties = sheet.get('properties', {})
        # setdefault keeps the first tab with a given title, matching the
        # first-match lookup done by get_sheet_id.
        sheet_ids.setdefault(properties.get('title'), properties.get('sheetId'))

    rows = []
    for name in SHEET_NAMES.values():
        sheet_id = sheet_ids.get(name)  # None when the tab title is unknown
        last_row = await calculate_range(name, 'A')
        result = spreadsheets.values().get(
            spreadsheetId=SPREADSHEET_ID,
            range=f'{name}!C2:F{last_row}',
        ).execute()
        # The range starts at sheet row 2, hence start=2 for the link target.
        for row_number, row in enumerate(result.get('values', []), start=2):
            # NOTE(review): list.insert appends when the row has fewer than
            # three cells — confirm utils.parse_data tolerates short rows.
            row.insert(3, utils.generate_cell_link(SPREADSHEET_ID, sheet_id, row_number))
            rows.append(row)

    parsed_data = utils.parse_data(rows)
    collection = client['secretways']['levels']
    seen_ids = set()
    for key, value in parsed_data.items():
        level_id = int(key)
        # Only existence matters here, so project just the _id.
        if not collection.find_one({'_id': level_id}, {'_id': 1}):
            # First time this level is seen. Use an aware UTC datetime:
            # PyMongo stores naive datetimes as if they were already UTC,
            # so a naive local-time now() would be recorded with an offset.
            value['timestamp'] = datetime.now(timezone.utc)
        collection.update_one({'_id': level_id}, {'$set': value}, upsert=True)
        seen_ids.add(level_id)

    # Drop levels that are no longer present in the spreadsheet.
    # NOTE(review): when parse_data yields nothing the $nin list is empty and
    # this removes every document — confirm that is the intended behaviour.
    collection.delete_many({'_id': {'$nin': list(seen_ids)}})
def get_all_ids(client: MongoClient) -> list[str]:
    """Return the ``_id`` of every document in ``secretways.levels`` as text.

    Args:
        client: MongoDB client used to reach the collection.

    Returns:
        One string per document, in the order the query yields them.
    """
    # Ask for the _id field only; nothing else is needed here.
    id_projection = {'_id': 1}
    cursor = client['secretways']['levels'].find({}, id_projection)
    return [str(record['_id']) for record in cursor]
def _sync_collection(collection, records, id_key):
    """Upsert ``records`` by integer ``_id`` and delete every other document.

    Args:
        collection: MongoDB collection to synchronise.
        records: Iterable of dicts. ``id_key`` is popped from each one and
            converted to ``int`` for ``_id``; the remaining fields are
            written with ``$set``.
        id_key: Name of the field that holds the record identifier.
    """
    seen_ids = set()
    for record in records:
        record_id = int(record.pop(id_key))
        collection.update_one({'_id': record_id}, {'$set': record}, upsert=True)
        seen_ids.add(record_id)
    # An empty $nin list matches every document, so an empty `records`
    # clears the collection.
    collection.delete_many({'_id': {'$nin': list(seen_ids)}})


async def scrape_robtop_api(client: MongoClient):
    """Refresh the ``robtop`` database from the level-search endpoint.

    The ids returned by ``get_all_ids`` are posted to the endpoint in batches
    of ten. Each response is parsed with ``utils.robtop_to_data`` into levels,
    creators and songs; creators and songs are de-duplicated across batches.
    The three result sets are then written to ``robtop.levels``,
    ``robtop.creators`` and ``robtop.songs``: every record is upserted and all
    documents not seen in this run are deleted.

    The HTTP requests are made synchronously; nothing is awaited inside this
    coroutine.

    Args:
        client: MongoDB client used for both reading the ids and writing the
            results.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    batch_size = 10
    request_timeout = 30  # seconds; without it a stalled server hangs the run

    all_ids = get_all_ids(client)
    url = "https://www.boomlings.com/database/getGJLevels21.php"
    # NOTE(review): the empty User-Agent is presumably deliberate (overrides
    # the default requests UA) — confirm before changing.
    headers = {
        "User-Agent": "",
    }
    payload = {
        "onlyCompleted": 1,
        "secret": "Wmfd2893gb7",
    }

    levels, creators, songs = [], set(), set()
    # Walking the list by slices sends no request at all when there are no
    # ids, and avoids mutating the list while batching.
    for start in range(0, len(all_ids), batch_size):
        batch = all_ids[start:start + batch_size]
        # Must be a plain string; a trailing comma here would silently turn
        # the value into a 1-tuple.
        payload["completedLevels"] = f"({','.join(batch)})"
        response = requests.post(
            url=url, data=payload, headers=headers, timeout=request_timeout
        )
        parsed = utils.robtop_to_data(response.text)
        if not parsed[0]:  # shouldn't be necessary but just in case
            # NOTE(review): stopping here leaves `levels` incomplete, and the
            # sync below then deletes every document that was not fetched —
            # confirm this is acceptable for a partial or failed response.
            break
        levels += parsed[0]
        # Dicts are unhashable; store their items as tuples so the sets can
        # de-duplicate creators and songs shared between batches.
        creators.update(tuple(creator.items()) for creator in parsed[1])
        songs.update(tuple(song.items()) for song in parsed[2])

    database = client['robtop']
    _sync_collection(database['levels'], levels, '1')
    _sync_collection(database['creators'], [dict(creator) for creator in creators], 'userID')
    _sync_collection(database['songs'], [dict(song) for song in songs], '1')