-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
47 lines (38 loc) · 1.29 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import asyncio
from playwright.async_api import async_playwright
from itertools import groupby
# JavaScript arrow function (kept as source text) that is evaluated in the page
# with a root element as its argument. It walks every text node under `root`
# with a TreeWalker and returns a list of [fontFamily, fontStyle, nodeValue]
# triples, where the two font properties come from the computed style of the
# text node's parent element.
JS_GET_TEXT_NODE = """
root => {
const walker = document.createTreeWalker(
root,
NodeFilter.SHOW_TEXT,
);
let node;
const textNodes = []
while(node = walker.nextNode()) {
const style = getComputedStyle(node.parentElement);
textNodes.push([style.fontFamily, style.fontStyle, node.nodeValue])
}
return textNodes;
}
"""
def _group_text_nodes(text_nodes):
    """Collapse raw text-node triples into one record per (font, style) pair.

    Args:
        text_nodes: Iterable of ``[font_family, font_style, text]`` triples as
            produced by ``JS_GET_TEXT_NODE``.

    Returns:
        A list of dicts with keys ``"name"`` (first entry of the font-family
        list, double quotes removed), ``"style"`` (the font style) and
        ``"characters"`` (the sorted, de-duplicated characters rendered with
        that font/style). The list is ordered by ``(name, style)``.
    """
    chars_by_font = {}
    for family, style, text in text_nodes:
        # Only the first family of the comma-separated list is used as the name.
        primary_family = family.split(",")[0].replace('"', "")
        chars_by_font.setdefault((primary_family, style), set()).update(text)
    return [
        {
            "name": name,
            "style": style,
            "characters": "".join(sorted(chars_by_font[(name, style)])),
        }
        for name, style in sorted(chars_by_font)
    ]


async def scrape(url):
    """Load *url* in headless Chromium and report the characters used per font.

    Args:
        url: Address of the page to open.

    Returns:
        A list of dicts with keys ``"name"``, ``"style"`` and ``"characters"``,
        one per distinct (first font family, font style) pair found on the
        text nodes under ``<body>``, ordered by that pair.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(url)
            # Wait for font loading to complete before reading computed styles.
            await page.evaluate("document.fonts.ready")
            text_nodes = await page.locator("body").evaluate(JS_GET_TEXT_NODE)
        finally:
            # Release the browser even when navigation or evaluation fails.
            await browser.close()
    return _group_text_nodes(text_nodes)
if __name__ == "__main__":
    # Script entry point: scrape the hard-coded URL and print the result list.
    URL = "http://localhost:5173"
    result = asyncio.run(scrape(URL))
    print(result)