Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update utf8 implementation #144

Merged
merged 2 commits into from
Jun 18, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions lib/object.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ var dataToString = function(asUTF8) {
}

if (!asUTF8 && !this.options.binary) {
result = out.utf8encode(result);
result = utils.transformTo("string", out.utf8encode(result));
}
return result;
};
Expand Down Expand Up @@ -342,11 +342,11 @@ var generateCompressedObjectFrom = function(file, compression) {
*/
var generateZipParts = function(name, file, compressedObject, offset) {
var data = compressedObject.compressedContent,
utfEncodedFileName = this.utf8encode(file.name),
utfEncodedFileName = utils.transformTo("string", utf8.utf8encode(file.name)),
comment = file.comment || "",
utfEncodedComment = this.utf8encode(comment),
useUTF8ForFileName = utfEncodedFileName !== file.name,
useUTF8ForComment = utfEncodedComment !== comment,
utfEncodedComment = utils.transformTo("string", utf8.utf8encode(comment)),
useUTF8ForFileName = utfEncodedFileName.length !== file.name.length,
useUTF8ForComment = utfEncodedComment.length !== comment.length,
o = file.options,
dosTime,
dosDate,
Expand Down Expand Up @@ -640,7 +640,7 @@ var out = {
localDirLength = 0,
centralDirLength = 0,
writer, i,
utfEncodedComment = this.utf8encode(options.comment || this.comment || "");
utfEncodedComment = utils.transformTo("string", this.utf8encode(options.comment || this.comment || ""));

// first, generate all the zip parts.
for (var name in this.files) {
Expand Down Expand Up @@ -738,7 +738,7 @@ var out = {
* This method will be removed in a future version without replacement.
*/
utf8encode: function (string) {
return utf8.utf8encode(string);
return utils.transformTo("string", utf8.utf8encode(string));
},

/**
Expand Down
241 changes: 181 additions & 60 deletions lib/utf8.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,95 +15,216 @@ if (
}

/**
* http://www.webtoolkit.info/javascript-utf8.html
* The following functions come from pako, from pako/lib/utils/strings
* released under the MIT license, see pako https://github.com/nodeca/pako/
*/
exports.utf8encode = function utf8encode(string) {
// TextEncoder + Uint8Array to binary string is faster than checking every bytes on long strings.
// http://jsperf.com/utf8encode-vs-textencoder
// On short strings (file names for example), the TextEncoder API is (currently) slower.
if (textEncoder) {
var u8 = textEncoder.encode(string);
return utils.transformTo("string", u8);

// Table with utf8 lengths (calculated by first byte of sequence).
// Index: the first byte of a sequence (0-255); value: the sequence length in bytes.
// Note that 5 & 6-byte values and some 4-byte values can not be represented in JS,
// because the max possible codepoint is 0x10ffff.
var _utf8len = new Array(256);
for (var i = 0; i < 256; i++) {
    _utf8len[i] = (i >= 252 ? 6 : i >= 248 ? 5 : i >= 240 ? 4 : i >= 224 ? 3 : i >= 192 ? 2 : 1);
}
// 0xFE and 0xFF can never start a UTF-8 sequence: give both a length of 1 so that
// such a byte is consumed alone. (The previous code assigned index 254 twice and
// left 255 with a length of 6.)
_utf8len[254] = _utf8len[255] = 1; // Invalid sequence start

// convert string to array (typed, when possible)
var string2buf = function (str) {
var buf, c, c2, m_pos, i, str_len = str.length, buf_len = 0;

// count binary size
for (m_pos = 0; m_pos < str_len; m_pos++) {
c = str.charCodeAt(m_pos);
if ((c & 0xfc00) === 0xd800 && (m_pos+1 < str_len)) {
c2 = str.charCodeAt(m_pos+1);
if ((c2 & 0xfc00) === 0xdc00) {
c = 0x10000 + ((c - 0xd800) << 10) + (c2 - 0xdc00);
m_pos++;
}
}
buf_len += c < 0x80 ? 1 : c < 0x800 ? 2 : c < 0x10000 ? 3 : 4;
}
if (support.nodebuffer) {
return utils.transformTo("string", nodeBuffer(string, "utf-8"));

// allocate buffer
if (support.uint8array) {
buf = new Uint8Array(buf_len);
} else {
buf = new Array(buf_len);
}

// array.join may be slower than string concatenation but generates less objects (less time spent garbage collecting).
// See also http://jsperf.com/array-direct-assignment-vs-push/31
var result = [],
resIndex = 0;
// convert
for (i=0, m_pos = 0; i < buf_len; m_pos++) {
c = str.charCodeAt(m_pos);
if ((c & 0xfc00) === 0xd800 && (m_pos+1 < str_len)) {
c2 = str.charCodeAt(m_pos+1);
if ((c2 & 0xfc00) === 0xdc00) {
c = 0x10000 + ((c - 0xd800) << 10) + (c2 - 0xdc00);
m_pos++;
}
}
if (c < 0x80) {
/* one byte */
buf[i++] = c;
} else if (c < 0x800) {
/* two bytes */
buf[i++] = 0xC0 | (c >>> 6);
buf[i++] = 0x80 | (c & 0x3f);
} else if (c < 0x10000) {
/* three bytes */
buf[i++] = 0xE0 | (c >>> 12);
buf[i++] = 0x80 | (c >>> 6 & 0x3f);
buf[i++] = 0x80 | (c & 0x3f);
} else {
/* four bytes */
buf[i++] = 0xf0 | (c >>> 18);
buf[i++] = 0x80 | (c >>> 12 & 0x3f);
buf[i++] = 0x80 | (c >>> 6 & 0x3f);
buf[i++] = 0x80 | (c & 0x3f);
}
}

for (var n = 0; n < string.length; n++) {
return buf;
};

var c = string.charCodeAt(n);
// Calculate max possible position in utf8 buffer,
// that will not break sequence. If that's not possible
// - (very small limits) return max size as is.
//
// buf[] - utf8 bytes array
// max - length limit (mandatory);
var utf8border = function(buf, max) {
var pos;

if (c < 128) {
result[resIndex++] = String.fromCharCode(c);
}
else if ((c > 127) && (c < 2048)) {
result[resIndex++] = String.fromCharCode((c >> 6) | 192);
result[resIndex++] = String.fromCharCode((c & 63) | 128);
max = max || buf.length;
if (max > buf.length) { max = buf.length; }

// go back from last position, until start of sequence found
pos = max-1;
while (pos >= 0 && (buf[pos] & 0xC0) === 0x80) { pos--; }

// Broken input - very small and malformed sequence,
// return max, because we should return something anyway.
if (pos < 0) { return max; }

// If we came to the start of the buffer - that means the buffer is too small,
// return max too.
if (pos === 0) { return max; }

return (pos + _utf8len[buf[pos]] > max) ? pos : max;
};

// convert array to string
var buf2string = function (buf) {
var str, i, out, c, c_len;
var len = buf.length;

// Reserve max possible length (2 words per char)
// NB: for unknown reasons, Array is significantly faster for
// String.fromCharCode.apply than Uint16Array.
var utf16buf = new Array(len*2);

for (out=0, i=0; i<len;) {
c = buf[i++];
// quick process ascii
if (c < 0x80) { utf16buf[out++] = c; continue; }

c_len = _utf8len[c];
// skip 5 & 6 byte codes
if (c_len > 4) { utf16buf[out++] = 0xfffd; i += c_len-1; continue; }

// apply mask on first byte
c &= c_len === 2 ? 0x1f : c_len === 3 ? 0x0f : 0x07;
// join the rest
while (c_len > 1 && i < len) {
c = (c << 6) | (buf[i++] & 0x3f);
c_len--;
}
else {
result[resIndex++] = String.fromCharCode((c >> 12) | 224);
result[resIndex++] = String.fromCharCode(((c >> 6) & 63) | 128);
result[resIndex++] = String.fromCharCode((c & 63) | 128);

// terminated by end of string?
if (c_len > 1) { utf16buf[out++] = 0xfffd; continue; }

if (c < 0x10000) {
utf16buf[out++] = c;
} else {
c -= 0x10000;
utf16buf[out++] = 0xd800 | ((c >> 10) & 0x3ff);
utf16buf[out++] = 0xdc00 | (c & 0x3ff);
}
}

// shrinkBuf(utf16buf, out)
if (utf16buf.length !== out) {
if(utf16buf.subarray) {
utf16buf = utf16buf.subarray(0, out);
} else {
utf16buf.length = out;
}
}

return result.join("");
// return String.fromCharCode.apply(null, utf16buf);
return utils.applyFromCharCode(utf16buf);
};


// That's all for the pako functions.


/**
* http://www.webtoolkit.info/javascript-utf8.html
* Transform a javascript string into an array (typed if possible) of bytes,
* UTF-8 encoded.
* @param {String} str the string to encode
* @return {Array|Uint8Array|Buffer} the UTF-8 encoded string.
*/
exports.utf8decode = function utf8decode(input) {
var result = [],
resIndex = 0;
var type = utils.getTypeOf(input);
var isArray = type !== "string";
var i = 0;
var c = 0,
c1 = 0,
c2 = 0,
c3 = 0;
exports.utf8encode = function utf8encode(str) {
// TextEncoder + Uint8Array to binary string is faster than checking every bytes on long strings.
// http://jsperf.com/utf8encode-vs-textencoder
// On short strings (file names for example), the TextEncoder API is (currently) slower.
if (textEncoder) {
return textEncoder.encode(str);
}
if (support.nodebuffer) {
return nodeBuffer(str, "utf-8");
}

return string2buf(str);
};


/**
* Transform a bytes array (or a representation) representing a UTF-8 encoded
* string into a javascript string.
* @param {Array|Uint8Array|Buffer} buf the data to decode
* @return {String} the decoded string.
*/
exports.utf8decode = function utf8decode(buf) {
// check if we can use the TextDecoder API
// see http://encoding.spec.whatwg.org/#api
if (textDecoder) {
return textDecoder.decode(
utils.transformTo("uint8array", input)
utils.transformTo("uint8array", buf)
);
}
if (support.nodebuffer) {
return utils.transformTo("nodebuffer", input).toString("utf-8");
return utils.transformTo("nodebuffer", buf).toString("utf-8");
}

while (i < input.length) {

c = isArray ? input[i] : input.charCodeAt(i);

if (c < 128) {
result[resIndex++] = String.fromCharCode(c);
i++;
buf = utils.transformTo(support.uint8array ? "uint8array" : "array", buf);

// return buf2string(buf);
// Chrome prefers to work with "small" chunks of data
// for the method buf2string.
// Firefox and Chrome have their own shortcut, IE doesn't seem to really care.
var result = [], k = 0, len = buf.length, chunk = 65536;
while (k < len) {
var nextBoundary = utf8border(buf, Math.min(k + chunk, len));
if (support.uint8array) {
result.push(buf2string(buf.subarray(k, nextBoundary)));
} else {
result.push(buf2string(buf.slice(k, nextBoundary)));
}
else if ((c > 191) && (c < 224)) {
c2 = isArray ? input[i + 1] : input.charCodeAt(i + 1);
result[resIndex++] = String.fromCharCode(((c & 31) << 6) | (c2 & 63));
i += 2;
}
else {
c2 = isArray ? input[i + 1] : input.charCodeAt(i + 1);
c3 = isArray ? input[i + 2] : input.charCodeAt(i + 2);
result[resIndex++] = String.fromCharCode(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
i += 3;
}

k = nextBoundary;
}

return result.join("");

};
// vim: set shiftwidth=4 softtabstop=4:
3 changes: 3 additions & 0 deletions lib/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ function arrayLikeToString(array) {
return result.join("");
}

// Expose arrayLikeToString to other modules under the name applyFromCharCode.
exports.applyFromCharCode = arrayLikeToString;


/**
* Copy the data from an array-like to an other array-like.
* @param {Array|ArrayBuffer|Uint8Array|Buffer} arrayFrom the origin array.
Expand Down
Binary file added test/ref/pile_of_poo.zip
Binary file not shown.
22 changes: 22 additions & 0 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,28 @@ testZipFile("Zip text file with UTF-8 characters in filename", "ref/utf8_in_name
equal(reload(actual), actual, "Generated ZIP can be parsed");
});

// The reference archive was created with:
// zip -X -0 pile_of_poo.zip Iñtërnâtiônàlizætiøn☃💩.txt
testZipFile("Zip text file and UTF-8, Pile Of Poo test", "ref/pile_of_poo.zip", function(expected) {
    // The string "Iñtërnâtiônàlizætiøn☃💩" (see http://mathiasbynens.be/notes/javascript-unicode),
    // written with escape sequences so that the encoding of this source file does not matter.
    // Escapes generated with
    // http://mothereff.in/js-escapes#1I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%C3%A0liz%C3%A6ti%C3%B8n%E2%98%83%F0%9F%92%A9
    var text = 'I\xF1t\xEBrn\xE2ti\xF4n\xE0liz\xE6ti\xF8n\u2603\uD83D\uDCA9';
    var entryName = text + ".txt";
    var entryContent = text + "\n";

    // Build an archive whose file name and content are both the unicode string.
    var zip = new JSZip();
    zip.file(entryName, entryContent);
    var actual = zip.generate({type:"string"});

    // The generated archive must be readable.
    var reloaded = reload(actual);
    equal(reloaded, actual, "Generated ZIP can be parsed");

    // The entry must be found by name in both the reference archive and the generated one.
    var entryInExpected = new JSZip(expected).file(entryName);
    ok(entryInExpected, "JSZip finds the unicode file name on the external file");
    var entryInActual = new JSZip(actual).file(entryName);
    ok(entryInActual, "JSZip finds the unicode file name on its own file");

    // The content must decode back to the original text in both archives.
    var textFromExpected = new JSZip(expected).file(entryName).asText();
    var textFromActual = new JSZip(actual).file(entryName).asText();

    equal(textFromExpected, entryContent, "JSZip can decode the external file");
    equal(textFromActual, entryContent, "JSZip can decode its own file");
});

testZipFile("Zip text file with date", "ref/text.zip", function(expected) {
var zip = new JSZip();
zip.file("Hello.txt", "Hello World\n", {date : new Date("July 17, 2009 14:36:57")});
Expand Down