-
Notifications
You must be signed in to change notification settings - Fork 0
/
aegisub-encoding-detection.patch
133 lines (120 loc) · 4.5 KB
/
aegisub-encoding-detection.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
From f733297499f72ea11c166f6be23751a7b002c51c Mon Sep 17 00:00:00 2001
From: wangqr <wangqr@wangqr.tk>
Date: Sat, 18 May 2019 20:10:32 -0400
Subject: [PATCH 1/3] Rewrite encoding detection
Now feeds all data to uchardet, when uchardet is available. The file
size limit is removed.
When uchardet is not available, we look for UTF-8 BOM.
This should make loading UTF-8-BOM files faster.
Because Aegisub always save file in UTF-8-BOM, this should also
guarentee Aegisub will load large (>100MB) file saved by itself.
See Aegisub/Aegisub#110
---
libaegisub/common/charset.cpp | 42 ++++++++++++++++-------------------
1 file changed, 19 insertions(+), 23 deletions(-)
diff --git a/libaegisub/common/charset.cpp b/libaegisub/common/charset.cpp
index 1addb90ee..5ca248b9e 100644
--- a/libaegisub/common/charset.cpp
+++ b/libaegisub/common/charset.cpp
@@ -29,37 +29,33 @@ namespace agi { namespace charset {
std::string Detect(agi::fs::path const& file) {
agi::read_file_mapping fp(file);
- // If it's over 100 MB it's either binary or big enough that we won't
- // be able to do anything useful with it anyway
- if (fp.size() > 100 * 1024 * 1024)
- return "binary";
-
- uint64_t binaryish = 0;
-
#ifdef WITH_UCHARDET
agi::scoped_holder<uchardet_t> ud(uchardet_new(), uchardet_delete);
for (uint64_t offset = 0; offset < fp.size(); ) {
- auto read = std::min<uint64_t>(4096, fp.size() - offset);
+ auto read = std::min<uint64_t>(65536, fp.size() - offset);
auto buf = fp.read(offset, read);
uchardet_handle_data(ud, buf, read);
- uchardet_data_end(ud);
- if (*uchardet_get_charset(ud))
- return uchardet_get_charset(ud);
-
offset += read;
-
- // A dumb heuristic to detect binary files
- for (size_t i = 0; i < read; ++i) {
- if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
- ++binaryish;
- }
-
- if (binaryish > offset / 8)
- return "binary";
}
- return uchardet_get_charset(ud);
+ uchardet_data_end(ud);
+ std::string encoding = uchardet_get_charset(ud);
+ return encoding.empty() ? "binary" : encoding;
#else
- auto read = std::min<uint64_t>(4096, fp.size());
+
+ // Look for utf-8 BOM
+ if (fp.size() >= 3) {
+ const char* buf = fp.read(0, 3);
+ if (!strncmp(buf, "\xef\xbb\xbf", 3))
+ return "utf-8";
+ }
+
+ // If it's over 100 MB it's either binary or big enough that we won't
+ // be able to do anything useful with it anyway
+ if (fp.size() > 100 * 1024 * 1024)
+ return "binary";
+
+ uint64_t binaryish = 0;
+ auto read = std::min<uint64_t>(65536, fp.size());
auto buf = fp.read(0, read);
for (size_t i = 0; i < read; ++i) {
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
From d6ddea0f6577df319b5f2cbd2a54d5206693f752 Mon Sep 17 00:00:00 2001
From: wangqr <wangqr@wangqr.tk>
Date: Sun, 16 Jun 2019 19:14:10 -0400
Subject: [PATCH 2/3] Detect EBML magic number to skip encoding detection for MKV
MKV loads slow after f733297499f72ea11c166f6be23751a7b002c51c
---
libaegisub/common/charset.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/libaegisub/common/charset.cpp b/libaegisub/common/charset.cpp
index 5ca248b9e..4a9d40f47 100644
--- a/libaegisub/common/charset.cpp
+++ b/libaegisub/common/charset.cpp
@@ -29,6 +29,14 @@ namespace agi { namespace charset {
std::string Detect(agi::fs::path const& file) {
agi::read_file_mapping fp(file);
+ // FIXME: Dirty hack for Matroska. These 4 bytes are the magic
+ // number of EBML which is used by mkv and webm
+ if (fp.size() >= 4) {
+ const char* buf = fp.read(0, 4);
+ if (!strncmp(buf, "\x1a\x45\xdf\xa3", 4))
+ return "binary";
+ }
+
#ifdef WITH_UCHARDET
agi::scoped_holder<uchardet_t> ud(uchardet_new(), uchardet_delete);
for (uint64_t offset = 0; offset < fp.size(); ) {
From fbca22229524210c3c9ae0f2bb423d405065d8f0 Mon Sep 17 00:00:00 2001
From: wangqr <wangqr@wangqr.tk>
Date: Mon, 2 Sep 2019 14:25:52 -0400
Subject: [PATCH 3/3] Treat empty file as ascii
---
libaegisub/common/charset.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/libaegisub/common/charset.cpp b/libaegisub/common/charset.cpp
index 4a9d40f47..322430556 100644
--- a/libaegisub/common/charset.cpp
+++ b/libaegisub/common/charset.cpp
@@ -29,6 +29,9 @@ namespace agi { namespace charset {
std::string Detect(agi::fs::path const& file) {
agi::read_file_mapping fp(file);
+ // FIXME: It is an empty file. Treat as ascii
+ if (fp.size() == 0) return "ascii";
+
// FIXME: Dirty hack for Matroska. These 4 bytes are the magic
// number of EBML which is used by mkv and webm
if (fp.size() >= 4) {