-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl.test.js
83 lines (70 loc) · 2.2 KB
/
crawl.test.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
const { test, expect } = require('@jest/globals');
const { normalizeURL, getURLsFromHTML } = require('./crawl.js');
// ---------- Tests for normalizeURL ----------
/**
 * Test suite for normalizeURL.
 *
 * Each case feeds one URL string to normalizeURL and compares the returned
 * string with the expected normalized form: lower-cased, without a trailing
 * '/', without a leading 'www.', and with other subdomains, ports, query
 * parameters and hash fragments left in place.
 */
describe('Testing normalizeURL', () => {
  test('Converts to lowercase', () => {
    expect(normalizeURL('https://eXaMple.cOm/pAth')).toEqual(
      'https://example.com/path'
    );
  });

  test("Truncates trailing '/'", () => {
    expect(normalizeURL('https://example.com/path/')).toEqual(
      'https://example.com/path'
    );
  });

  // Only the 'www.' case is asserted here; the plain-host variant is already
  // covered by the trailing-slash test above.
  test("Truncates 'www.'", () => {
    expect(normalizeURL('https://www.example.com/path/')).toEqual(
      'https://example.com/path'
    );
  });

  test('Preserves other subdomains', () => {
    expect(normalizeURL('https://subdomain.example.com/path')).toEqual(
      'https://subdomain.example.com/path'
    );
  });

  test('Preserves ports', () => {
    expect(normalizeURL('https://example.com:8080/path/')).toEqual(
      'https://example.com:8080/path'
    );
  });

  // The query string uses a literal '&' between the two parameters so the
  // case exercises a real multi-parameter query plus a hash fragment.
  test('Preserves query params and hashes', () => {
    expect(
      normalizeURL(
        'https://www.example.com:8080/path/to/page?param1=value1&param2=value2#section1'
      )
    ).toEqual(
      'https://example.com:8080/path/to/page?param1=value1&param2=value2#section1'
    );
  });
});
// ---------- Tests for getURLsFromHTML ----------
/**
 * Test suite for getURLsFromHTML.
 *
 * The first part feeds three anchors (one absolute href, two root-relative
 * hrefs) together with a base URL and checks that every returned link looks
 * absolute. The last test checks that an href which is neither absolute nor
 * root-relative ("path/one") yields an empty result.
 */
describe('getURLsFromHTML converts relative URLs to absolute', () => {
  // Starts with "http://", "https://" or "www.", contains at least one
  // dot-separated part, and does not end in '/'. The prefix is an alternation
  // group (not a character class) so a link that still begins with '/' cannot
  // match.
  const absoluteUrlPattern = /^(?:https?:\/\/|www\.).+(\..+)+[^\/]$/;

  const htmlString = `<a href="https://www.example.com/path/to/page">Link 1</a>
    <a href="/path/to/page">Link 2</a>
    <a href="/path/to/page?param1=value1&param2=value2#section1">Link 3</a>`;

  // One label per anchor in htmlString, in document order.
  const testMessages = [
    'Absolute Path',
    'Relative path',
    'Relative path with query and hashes'
  ];

  const links = getURLsFromHTML(htmlString, 'https://www.example.com');

  // Guards against a vacuous pass: without this, an empty result would
  // produce no per-link assertions at all.
  test('Returns one URL per anchor', () => {
    expect(links).toHaveLength(testMessages.length);
  });

  // Iterate over the fixed labels rather than over the result, so a missing
  // link shows up as a failing test instead of a test that never exists.
  testMessages.forEach((message, index) => {
    test(message, () => {
      expect(links[index]).toMatch(absoluteUrlPattern);
    });
  });

  test('Handles errors for invalid URLs', () => {
    const inputBody = `<html>
      <body>
        <a href="path/one">
          <span>Boot.dev></span>
        </a>
      </body>
    </html>`;
    expect(getURLsFromHTML(inputBody, 'https://blog.boot.dev')).toEqual([]);
  });
});