-
Notifications
You must be signed in to change notification settings - Fork 1
/
readable.ts
131 lines (108 loc) · 3.44 KB
/
readable.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write --allow-env=HTTPS_PROXY,LC_ALL,LC_MESSAGES,LANG,LANGUAGE --no-prompt --no-check --
// Version string of readability-cli; printVersion() prints it after "readability-cli v".
const version = "2.4.5"
import * as path from "https://deno.land/std@0.201.0/path/mod.ts"
import yargs from "https://deno.land/x/yargs@v17.7.2-deno/deno.ts"
import y18n from "https://deno.land/x/y18n@v5.0.8-deno/deno.ts"
import { initParser, DOMParser, DOMParserMimeType, Document, Element } from "https://deno.land/x/deno_dom@v0.1.38/deno-dom-wasm-noinit.ts"
import * as ammonia from "https://deno.land/x/ammonia@0.3.1/mod.ts"
import { Buffer } from "node:buffer"
import fs from "node:fs"
import process from "node:process"
import { Readability, isProbablyReaderable } from "npm:@mozilla/readability@^0.4.4"
import UserAgent from "npm:user-agents@1.0"
// Resolve the UI locale from the environment. GNU gettext would give LANGUAGE
// the highest priority, but the lookup order used here is consistent with Yargs.
const locale = (() => {
	let candidate = "en_US"
	for (const variable of ["LC_ALL", "LC_MESSAGES", "LANG", "LANGUAGE"]) {
		const value = Deno.env.get(variable)
		// Unset and empty variables are both skipped; the first non-empty one wins.
		if (value) {
			candidate = value
			break
		}
	}
	// Drop everything from the first "." or ":" onwards (e.g. "de_DE.UTF-8" -> "de_DE").
	return candidate.replace(/[.:].*/, '')
})()

// Translation function backed by the "locales" directory that sits next to this file.
// updateFiles is off, so the translation files are never written to.
const { __ } = y18n({
	locale,
	updateFiles: false,
	directory: path.join(
		path.dirname(path.fromFileUrl(import.meta.url)),
		"locales"
	)
})
/**
 * Prints two lines via console.log: the readability-cli version,
 * followed by the version of the Deno runtime.
 */
function printVersion() {
	const lines = [
		`readability-cli v${version}`,
		`Deno ${Deno.version.deno}`
	]
	for (const line of lines) {
		console.log(line)
	}
}
/**
 * Extracts the bare MIME type from a Content-Type header value, dropping any
 * parameters (e.g. "text/html; charset=utf-8" -> "text/html").
 *
 * @param contentType Raw header value, or null when the header is absent.
 * @returns The lower-cased MIME type, or "text/html" when the header is missing or empty.
 */
function mimeTypeFromContentType(contentType: string | null): string {
	if (!contentType) {
		return "text/html"
	}
	// indexOf() yields -1 when there are no parameters; slicing with -1 would
	// cut off the last character, so that case is handled explicitly.
	const parametersStart = contentType.indexOf(";")
	const essence = parametersStart === -1 ? contentType : contentType.slice(0, parametersStart)
	return essence.trim().toLowerCase() || "text/html"
}

/**
 * Downloads `url` and parses the response body with parseDOM().
 *
 * @param url Address to fetch; also passed to parseDOM() as the document URL.
 * @param _proxy Unused in this implementation.
 * @param _strictSSL Unused in this implementation.
 * @param userAgent Value for the User-Agent header. When nullish, a desktop
 *   user agent string is generated instead.
 * @returns Whatever parseDOM() returns for the downloaded markup.
 * @throws A plain object `{ statusCode, response: { statusMessage } }` when the
 *   response status is not OK. The shape is kept as-is because callers
 *   presumably inspect these fields.
 */
async function parseDOMFromURL(url: string, _proxy: string, _strictSSL: boolean, userAgent: string) {
	// Kick off parser initialisation first so it runs while the request is in flight.
	const initParserPromise = initParser()

	const userAgentString = userAgent ?? new UserAgent({ deviceCategory: "desktop" }).toString()
	const response = await fetch(url, {
		headers: {
			"User-Agent": userAgentString
		}
	})

	if (!response.ok) {
		// Release the unread body before bailing out; a failure to cancel must
		// not hide the status error.
		await response.body?.cancel().catch(() => {})
		throw {
			statusCode: response.status,
			response: {
				statusMessage: response.statusText
			}
		}
	}

	const text = await response.text()
	await initParserPromise

	const mimeType = mimeTypeFromContentType(response.headers.get("Content-Type"))
	return parseDOM(text, url, mimeType as DOMParserMimeType)
}
/**
 * Resolves `href` against `baseURL` when it is a relative reference.
 *
 * @returns The absolute URL string, or null when `href` is already an absolute
 *   URL or cannot be parsed at all (the attribute should then be left untouched).
 */
function resolveRelativeHref(href: string, baseURL: URL): string | null {
	try {
		// Parses without a base only if href is already absolute.
		new URL(href)
		return null
	} catch {
		// Not absolute: fall through and treat it as a relative reference.
	}
	try {
		return new URL(href, baseURL).href
	} catch {
		// Malformed even relative to the base; one bad link must not abort the parse.
		return null
	}
}

/**
 * Parses markup into a document and rewrites every relative `href` attribute
 * into an absolute URL.
 *
 * The base for resolution is the first `<base href>` in the document if it has
 * a non-empty value, otherwise `url`. A relative `<base href>` is itself
 * resolved against `url`. When neither is available, no attribute is changed.
 *
 * @param html Markup to parse.
 * @param url URL of the document, used as the (fallback) base URL.
 * @param mimeType MIME type handed to the parser; defaults to "text/html".
 * @returns A one-element array holding the parsed document.
 * @throws TypeError when the base cannot be turned into an absolute URL, e.g.
 *   a relative `<base href>` with no usable `url`.
 */
async function parseDOM(html: string, url?: string, mimeType?: DOMParserMimeType) {
	await initParser()
	const document = new DOMParser().parseFromString(html, mimeType ?? "text/html")!

	const declaredBase = document.getElementsByTagName("base")[0]?.getAttribute("href")
	// `||` rather than `??`: an empty <base href=""> should fall back to the document URL.
	const baseURLString = declaredBase || url
	if (baseURLString) {
		// An empty `url` is passed as undefined, because an unparsable base
		// argument makes the URL constructor throw even for absolute input.
		const baseURL = new URL(baseURLString, url || undefined)

		// Iterative depth-first walk over all elements, using an explicit stack.
		const nodes: Element[] = [document.documentElement!]
		while (nodes.length > 0) {
			const element = nodes.pop()!
			const href = element.getAttribute("href")
			if (href) {
				const absolute = resolveRelativeHref(href, baseURL)
				if (absolute !== null) {
					element.setAttribute("href", absolute)
				}
			}
			// Push one by one instead of spreading, so very wide elements
			// cannot exceed the engine's argument-count limit.
			for (const child of element.children) {
				nodes.push(child)
			}
		}
	}
	return [document]
}
/**
 * Reads `file`, decodes its bytes with a default TextDecoder, and passes the
 * resulting text to parseDOM() together with `url`.
 *
 * @param file Path of the file to read.
 * @param url Document URL forwarded to parseDOM().
 * @returns Whatever parseDOM() returns for the file's contents.
 */
async function parseDOMFromFile(file: string, url: string) {
	const bytes = await Deno.readFile(file)
	const decoder = new TextDecoder()
	const markup = decoder.decode(bytes)
	return parseDOM(markup, url)
}
/**
 * Runs an HTML string through ammonia's clean().
 *
 * @param html Markup to clean.
 * @returns The value produced by ammonia.clean() for `html`.
 */
async function sanitizeHTML(html: string) {
	// init() is awaited on every call, before clean() is used.
	const sanitizer = ammonia
	await sanitizer.init()
	const cleaned = sanitizer.clean(html)
	return cleaned
}
/**
 * Serialises the document's root element and sanitises the resulting markup
 * with sanitizeHTML().
 *
 * @param document Document whose root element's outerHTML is cleaned.
 * @returns The sanitised markup.
 */
async function sanitizeDOM(document: Document) {
	const root = document.documentElement!
	const cleaned = await sanitizeHTML(root.outerHTML)
	return cleaned
}
// Entry point imported from ./common.mjs; presumably the runtime-independent CLI logic.
import readable from "./common.mjs"
// Hand over the Node-compat modules, the Yargs instance built from Deno.args, the
// translation function, the Readability exports, and the helpers defined in this file.
// The arguments are positional, so their order presumably has to match the
// parameter list in common.mjs.
await readable(
Buffer,
fs,
process,
yargs(Deno.args),
__,
Readability,
isProbablyReaderable,
printVersion,
parseDOM,
parseDOMFromFile,
parseDOMFromURL,
sanitizeDOM,
sanitizeHTML
)
// If a truthy exit code has been recorded on process.exitCode, terminate explicitly;
// presumably so that the process ends with that code right away — TODO confirm.
if (process.exitCode) {
process.exit()
}