
Commit

Merge branch 'no-zip64'
Touffy committed May 8, 2023
2 parents aa8e70b + 59a4cfe commit 8625ddc
Showing 9 changed files with 129 additions and 40 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -2,9 +2,9 @@ name: Test

on:
push:
branches: [master]
branches: [master, no-zip64]
pull_request:
branches: [master]
branches: [master, no-zip64]

jobs:
test:
18 changes: 15 additions & 3 deletions README.md
@@ -63,7 +63,7 @@ The module exports three functions:
```typescript
function downloadZip(files: ForAwaitable<InputTypes>, options?: Options): Response

function makeZip(files: ForAwaitable<InputTypes>): ReadableStream
function makeZip(files: ForAwaitable<InputTypes>, options?: Options): ReadableStream

function predictLength(metadata: Iterable<MetadataTypes>): bigint
```
@@ -76,7 +76,7 @@ function predictLength(metadata: Iterable<MetadataTypes>): bigint
- `lastModified`: last modification date of the file (defaults to `new Date()` unless the input is a File or Response with a valid "Last-Modified" header)
- `input`: something that contains your data; it can be a `File`, a `Blob`, a `Response`, some kind of `ArrayView` or a raw `ArrayBuffer`, a `ReadableStream<Uint8Array>` (yes, only Uint8Arrays, but most APIs give you just that type anyway), an `AsyncIterable<ArrayBuffer | ArrayView | string>`, … or just a string.

The *options* argument currently supports two properties, `length` and `metadata` (see [Content-Length prediction](#content-length-prediction) just below).
The *options* argument currently supports three properties, `length`, `metadata` (see [Content-Length prediction](#content-length-prediction)) and `buffersAreUTF8` (see [Filename encoding](#filename-encoding)).
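To make the accepted input shapes concrete, here is a hypothetical file list (the names and contents are made up; a `File`, `Blob` or `Response` could equally be used as `input`):

```typescript
// Hypothetical entries for downloadZip/makeZip, showing a few accepted shapes:
// a string input, a raw ArrayBuffer input, and a folder (no input at all).
const files = [
  { name: "hello.txt", lastModified: new Date("2023-05-08"), input: "Hello, world!" },
  { name: "data.bin", input: new Uint8Array([0x50, 0x4b, 0x03, 0x04]).buffer },
  { name: "empty-folder/" }, // no input and a trailing slash: a folder entry
]
```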

The function returns a `Response` immediately. You don't need to wait for the whole ZIP to be ready. It's up to you if you want to pipe the Response somewhere (e.g. if you are using `client-zip` inside a ServiceWorker) or let the browser buffer it all in a Blob.
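As a sketch of those two consumption patterns, here is a plain `Response` standing in for the one `downloadZip` returns (client-zip itself is not imported here; the body is pretend data):

```typescript
// A plain Response standing in for the one downloadZip returns.
const zipResponse = new Response("PK\x03\x04 ...pretend zip bytes...")

// Pattern 1: let the browser buffer it all into a Blob.
const blob = await zipResponse.clone().blob()

// Pattern 2: consume the body as a stream (what effectively happens when a
// ServiceWorker passes the Response to event.respondWith).
let streamedBytes = 0
const reader = zipResponse.body!.getReader()
for (let r = await reader.read(); !r.done; r = await reader.read()) {
  streamedBytes += r.value.length
}
```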

@@ -105,6 +105,18 @@ This iterable of metadata can be passed as the `metadata` property of `downloadZ

In the case of `predictLength`, you can even save the return value and pass it later to `downloadZip` as the `length` option, instead of repeating the `metadata`.
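For intuition, the arithmetic behind that prediction can be sketched as follows. This is my own simplified version, assuming no Zip64 records are needed; the constants are the standard ZIP sizes for the local file header (30 bytes), data descriptor (16), central directory record (46) and end-of-central-directory record (22):

```typescript
// Simplified sketch of Content-Length prediction for a small, uncompressed
// archive with no Zip64 fields (the real predictLength also handles Zip64).
function roughZipLength(entries: { nameLength: number, size: number }[]): bigint {
  let total = 22n // end-of-central-directory record
  for (const { nameLength, size } of entries) {
    total += 30n + BigInt(nameLength) + BigInt(size) + 16n // local header + data + descriptor
    total += 46n + BigInt(nameLength)                      // central directory record
  }
  return total
}
```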

## Filename encoding

(tl;dr: set `buffersAreUTF8: true` in the *options* argument)

In ZIP archives, the *language encoding flag* indicates that a filename is encoded in UTF-8. Some ZIP programs (e.g. the built-in ZIP viewer in Windows) may not decode UTF-8 filenames correctly when this flag is off.

`client-zip` always encodes **string** filenames (including filenames extracted from URLs) as UTF-8 and sets this flag for the related entries. However, `downloadZip`'s *options* include a `buffersAreUTF8` setting, affecting filenames that you supply as an **ArrayBuffer** (or ArrayView).

By default (when `buffersAreUTF8` is not set or `undefined`), each ArrayBuffer filename is tested individually and flagged only if it is valid UTF-8. That is a safe default, but slightly inefficient, since UTF-8 is the only encoding you are likely to get in most contexts anyway. You can tell client-zip to skip the test by setting `buffersAreUTF8: true`; ArrayBuffers will then *always* be flagged as UTF-8 without checking.

<small>If you happen to get your filenames from a dusty API reading an antique filesystem, with non-ASCII filenames stored in some retro 8-bit encoding that you want to preserve in the ZIP archive, you may set `buffersAreUTF8: false`; ArrayBuffer filenames will then *never* be flagged as UTF-8. Beware that those filenames will extract correctly only with a ZIP program that uses the same system encoding as the source.</small>
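The validity test used in the default mode can be sketched with a fatal `TextDecoder`, which is essentially what client-zip does internally: decoding throws on invalid UTF-8 instead of substituting replacement characters.

```typescript
// UTF-8 validity check: a fatal TextDecoder throws on malformed input.
const strictUTF8 = new TextDecoder("utf-8", { fatal: true })
function isValidUTF8(bytes: Uint8Array): boolean {
  try { strictUTF8.decode(bytes); return true }
  catch { return false }
}
```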

# Benchmarks

*updated in May 2023*
@@ -138,7 +150,7 @@ In a different experiment using Deno to avoid storing very large output files, m

Now, comparing bundle size is clearly unfair because the others do a bunch of things that my library doesn't. Here you go anyway (sizes are shown in decimal kilobytes):

| | `client-zip`@2.4.0 | fflate@0.7.4 | zip.js@2.7.6 | conflux@4.0.3 | JSZip@3.10.1 |
| | `client-zip`@2.4.2 | fflate@0.7.4 | zip.js@2.7.6 | conflux@4.0.3 | JSZip@3.10.1 |
|--------------------|-------------------:|-------------:|--------------:|--------------:|--------------:|
| minified | 5.7kB | 29.8kB | 162.3kB | 198.8kB | 94.9kB |
| minified + gzipped | 2.2kB | 11kB | 57.8kB | 56.6kB | 27.6kB |
6 changes: 6 additions & 0 deletions index.d.ts
@@ -26,6 +26,12 @@ type InputFolder = { name: any, lastModified?: any, input?: never, size?: never
/** If provided, the returned Response will have its `Content-Length` header set to the result of
* calling `predictLength` on that metadata. Overrides the `length` option. */
metadata?: Iterable<InputWithMeta | InputWithSizeMeta | JustMeta>
/** The ZIP *language encoding flag* will always be set when a filename was given as a string,
* but when it is given as an ArrayView or ArrayBuffer, it depends on this option:
* - `true`: always on (ArrayBuffers will *always* be flagged as UTF-8) — recommended,
* - `false`: always off (ArrayBuffers will *never* be flagged as UTF-8),
* - `undefined`: each ArrayBuffer will be tested and flagged if it is valid UTF-8. */
buffersAreUTF8?: boolean
}

/** Given an iterable of file metadata (or equivalent),
14 changes: 10 additions & 4 deletions src/index.ts
@@ -19,13 +19,19 @@ type InputFolder = { name: any, lastModified?: any, input?: never, size?: never
/** Both filename and size must be provided; input is not helpful here. */
type JustMeta = { input?: StreamLike | undefined, name: any, lastModified?: any, size: number | bigint }

type Options = {
export type Options = {
/** If provided, the returned Response will have its `Content-Length` header set to this value.
* It can be computed accurately with the `predictLength` function. */
length?: number | bigint
/** If provided, the returned Response will have its `Content-Length` header set to the result of
* calling `predictLength` on that metadata. Overrides the `length` option. */
metadata?: Iterable<InputWithMeta | InputWithSizeMeta | JustMeta>
/** The ZIP *language encoding flag* will always be set when a filename was given as a string,
* but when it is given as an ArrayView or ArrayBuffer, it depends on this option:
* - `true`: always on (ArrayBuffers will *always* be flagged as UTF-8) — recommended,
* - `false`: always off (ArrayBuffers will *never* be flagged as UTF-8),
* - `undefined`: each ArrayBuffer will be tested and flagged if it is valid UTF-8. */
buffersAreUTF8?: boolean
}

function normalizeArgs(file: InputWithMeta | InputWithSizeMeta | InputWithoutMeta | InputFolder | JustMeta) {
@@ -57,9 +63,9 @@ export function downloadZip(files: ForAwaitable<InputWithMeta | InputWithSizeMet
const headers: Record<string, any> = { "Content-Type": "application/zip", "Content-Disposition": "attachment" }
if ((typeof options.length === "bigint" || Number.isInteger(options.length)) && options.length! > 0) headers["Content-Length"] = String(options.length)
if (options.metadata) headers["Content-Length"] = String(predictLength(options.metadata))
return new Response(makeZip(files), { headers })
return new Response(makeZip(files, options), { headers })
}

export function makeZip(files: ForAwaitable<InputWithMeta | InputWithSizeMeta | InputWithoutMeta | InputFolder>) {
return ReadableFromIter(loadFiles(mapFiles(files)));
export function makeZip(files: ForAwaitable<InputWithMeta | InputWithSizeMeta | InputWithoutMeta | InputFolder>, options: Options = {}) {
return ReadableFromIter(loadFiles(mapFiles(files), options));
}
30 changes: 19 additions & 11 deletions src/metadata.ts
@@ -1,22 +1,23 @@
import { encodeString } from "./utils.ts"
import { encodeString, makeUint8Array } from "./utils.ts"
import type { BufferLike, StreamLike } from "./input.ts"

export type Metadata = {
encodedName: Uint8Array
uncompressedSize?: bigint
/** keep track of whether the filename was supplied as a Buffer-like object */
nameIsBuffer: boolean
}

/** The file name and modification date will be read from the input if it is a File or Response;
* extra arguments can be given to override the input's metadata.
* For other types of input, the `name` is required and `modDate` will default to *now*.
* @param encodedName will be coerced to string, so… whatever
* @param name will be coerced, so… whatever, but Uint8Array or string would be nice
*/
export function normalizeMetadata(input?: File | Response | BufferLike | StreamLike, encodedName?: any, size?: number | bigint): Metadata {
if (encodedName !== undefined && !(encodedName instanceof Uint8Array)) encodedName = encodeString(encodedName)
export function normalizeMetadata(input?: File | Response | BufferLike | StreamLike, name?: unknown, size?: number | bigint): Metadata {
let [encodedName, nameIsBuffer] = normalizeName(name)

if (input instanceof File) return {
encodedName: fixFilename(encodedName || encodeString(input.name)),
uncompressedSize: BigInt(input.size)
encodedName: fixFilename(encodedName || encodeString(input.name)), uncompressedSize: BigInt(input.size), nameIsBuffer
}
if (input instanceof Response) {
const contentDisposition = input.headers.get("content-disposition")
@@ -25,14 +26,14 @@ export function normalizeMetadata(input?: File | Response | BufferLike | StreamL
const decoded = urlName && decodeURIComponent(urlName)
// @ts-ignore allow coercion from null to zero
const length = size || +input.headers.get('content-length')
return { encodedName: fixFilename(encodedName || encodeString(decoded)), uncompressedSize: BigInt(length) }
return { encodedName: fixFilename(encodedName || encodeString(decoded)), uncompressedSize: BigInt(length), nameIsBuffer }
}
encodedName = fixFilename(encodedName, input !== undefined || size !== undefined)
if (typeof input === "string") return { encodedName, uncompressedSize: BigInt(encodeString(input).length) }
if (input instanceof Blob) return { encodedName, uncompressedSize: BigInt(input.size) }
if (input instanceof ArrayBuffer || ArrayBuffer.isView(input)) return { encodedName, uncompressedSize: BigInt(input.byteLength) }
if (typeof input === "string") return { encodedName, uncompressedSize: BigInt(encodeString(input).length), nameIsBuffer }
if (input instanceof Blob) return { encodedName, uncompressedSize: BigInt(input.size), nameIsBuffer }
if (input instanceof ArrayBuffer || ArrayBuffer.isView(input)) return { encodedName, uncompressedSize: BigInt(input.byteLength), nameIsBuffer }
// @ts-ignore
return { encodedName, uncompressedSize: getUncompressedSize(input, size) }
return { encodedName, uncompressedSize: getUncompressedSize(input, size), nameIsBuffer }
}

function getUncompressedSize(input: any, size: number | bigint) {
@@ -42,6 +43,13 @@ function getUncompressedSize(input: any, size: number | bigint) {
return input ? undefined : 0n;
}

function normalizeName(name: unknown): [encodedName: Uint8Array | undefined, nameIsBuffer: boolean] {
if (!name) return [undefined, false]
if (name instanceof Uint8Array) return [name, true]
if (ArrayBuffer.isView(name) || name instanceof ArrayBuffer) return [makeUint8Array(name), true]
return [encodeString(name), false]
}

function fixFilename(encodedName: Uint8Array | undefined, isFile = true) {
if (!encodedName || encodedName.every(c => c === 47)) throw new Error("The file must have a name.")
// remove trailing slashes in files
29 changes: 21 additions & 8 deletions src/zip.ts
@@ -3,6 +3,7 @@ import { crc32 } from "./crc32.ts"
import { formatDOSDateTime } from "./datetime.ts"
import type { ZipEntryDescription, ZipFileDescription } from "./input.ts"
import { Metadata } from "./metadata.ts"
import { Options } from "./index.ts"

const fileHeaderSignature = 0x504b_0304, fileHeaderLength = 30
const descriptorSignature = 0x504b_0708, descriptorLength = 16
@@ -15,7 +16,7 @@ export type ForAwaitable<T> = AsyncIterable<T> | Iterable<T>

type Zip64FieldLength = 0 | 12 | 28

export function contentLength(files: Iterable<Metadata>) {
export function contentLength(files: Iterable<Omit<Metadata, 'nameIsBuffer'>>) {
let centralLength = BigInt(endLength)
let offset = 0n
let archiveNeedsZip64 = false
@@ -36,15 +37,27 @@
return centralLength + offset
}

export async function* loadFiles(files: ForAwaitable<ZipEntryDescription & Metadata>) {
export function flagNameUTF8({encodedName, nameIsBuffer}: Metadata, buffersAreUTF8?: boolean) {
// @ts-ignore
return (!nameIsBuffer || (buffersAreUTF8 ?? tryUTF8(encodedName))) * 0b1000
}
const UTF8Decoder = new TextDecoder('utf8', { fatal: true })
function tryUTF8(str: Uint8Array) {
try { UTF8Decoder.decode(str) }
catch { return false }
return true
}

export async function* loadFiles(files: ForAwaitable<ZipEntryDescription & Metadata>, options: Options) {
const centralRecord: Uint8Array[] = []
let offset = 0n
let fileCount = 0n
let archiveNeedsZip64 = false

// write files
for await (const file of files) {
yield fileHeader(file)
const flags = flagNameUTF8(file, options.buffersAreUTF8)
yield fileHeader(file, flags)
yield file.encodedName
if (file.isFile) {
yield* fileData(file)
@@ -55,7 +68,7 @@
const zip64HeaderLength = (bigOffset * 12 | bigFile * 28) as Zip64FieldLength
yield dataDescriptor(file, bigFile)

centralRecord.push(centralHeader(file, offset, zip64HeaderLength))
centralRecord.push(centralHeader(file, offset, flags, zip64HeaderLength))
centralRecord.push(file.encodedName)
if (zip64HeaderLength) centralRecord.push(zip64ExtraField(file, offset, zip64HeaderLength))
if (bigFile) offset += 8n // because the data descriptor will have 64-bit sizes
@@ -102,10 +115,10 @@ export async function* loadFiles(files: ForAwaitable<ZipEntryDescription & Metad
yield makeUint8Array(end)
}

export function fileHeader(file: ZipEntryDescription & Metadata) {
export function fileHeader(file: ZipEntryDescription & Metadata, flags = 0) {
const header = makeBuffer(fileHeaderLength)
header.setUint32(0, fileHeaderSignature)
header.setUint32(4, 0x2d_00_0800) // ZIP version 4.5 | flags, bit 3 on = size and CRCs will be zero
header.setUint32(4, 0x2d_00_0800 | flags) // ZIP version 4.5 | flags, bit 3 on = size and CRCs will be zero
// leave compression = zero (2 bytes) until we implement compression
formatDOSDateTime(file.modDate, header, 10)
// leave CRC = zero (4 bytes) because we'll write it later, in the central repo
@@ -149,11 +162,11 @@ export function dataDescriptor(file: ZipEntryDescription & Metadata, needsZip64:
return makeUint8Array(header)
}

export function centralHeader(file: ZipEntryDescription & Metadata, offset: bigint, zip64HeaderLength: Zip64FieldLength = 0) {
export function centralHeader(file: ZipEntryDescription & Metadata, offset: bigint, flags = 0, zip64HeaderLength: Zip64FieldLength = 0) {
const header = makeBuffer(centralHeaderLength)
header.setUint32(0, centralHeaderSignature)
header.setUint32(4, 0x2d03_2d_00) // UNIX app version 4.5 | ZIP version 4.5
header.setUint16(8, 0x0800) // flags, bit 3 on
header.setUint16(8, 0x0800 | flags) // flags, bit 3 on
// leave compression = zero (2 bytes) until we implement compression
formatDOSDateTime(file.modDate, header, 12)
header.setUint32(16, file.isFile ? file.crc! : 0, true)
2 changes: 1 addition & 1 deletion terser.json
@@ -4,7 +4,7 @@
"mangle": {
"reserved": ["m", "c"],
"properties": {
"regex": "^crc$|^uncompressedSize$|^modDate$|^bytes$|^encodedName$"
"regex": "^crc$|^uncompressedSize$|^modDate$|^bytes$|^encodedName$|^nameIsBuffer$"
}
}
}
12 changes: 6 additions & 6 deletions test/metadata.test.ts
@@ -16,7 +16,7 @@ Deno.test("normalizeMetadata guesses filename from Content-Disposition", () => {
const metadata = normalizeMetadata(new Response("four", {
headers: { "content-disposition": "attachment; filename=test.txt" }
}))
assertEquals(metadata, { uncompressedSize: 0n, encodedName })
assertEquals(metadata, { uncompressedSize: 0n, encodedName, nameIsBuffer: false })
})

Deno.test("normalizeMetadata guesses filename from a Response URL", () => {
@@ -25,7 +25,7 @@ Deno.test("normalizeMetadata guesses filename from a Response URL", () => {
headers: { get() { return new Headers() } }
})
const metadata = normalizeMetadata(response)
assertEquals(metadata, { uncompressedSize: 0n, encodedName })
assertEquals(metadata, { uncompressedSize: 0n, encodedName, nameIsBuffer: false })
})

Deno.test("normalizeMetadata guesses filename from a Response URL with trailing slash", () => {
@@ -34,24 +34,24 @@ Deno.test("normalizeMetadata guesses filename from a Response URL with trailing
headers: { get() { return new Headers() } }
})
const metadata = normalizeMetadata(response)
assertEquals(metadata, { uncompressedSize: 0n, encodedName })
assertEquals(metadata, { uncompressedSize: 0n, encodedName, nameIsBuffer: false })
})

/************************************** Files **************************************/

Deno.test("normalizeMetadata reads filename and size from a File", () => {
const metadata = normalizeMetadata(new File(["four"], "test.txt"))
assertEquals(metadata, { uncompressedSize: 4n, encodedName })
assertEquals(metadata, { uncompressedSize: 4n, encodedName, nameIsBuffer: false })
})

/************************************** Folders **************************************/

Deno.test("normalizeMetadata fixes trailing slashes in folder names", () => {
const metadata = normalizeMetadata(undefined, new TextEncoder().encode("root/folder"))
assertEquals(metadata, { uncompressedSize: 0n, encodedName: encodedFolderName })
assertEquals(metadata, { uncompressedSize: 0n, encodedName: encodedFolderName, nameIsBuffer: true })
})

Deno.test("normalizeMetadata fixes trailing slashes in file names", () => {
const metadata = normalizeMetadata(undefined, encodedFolderName, 0n)
assertEquals(metadata, { uncompressedSize: 0n, encodedName: new TextEncoder().encode("root/folder") })
assertEquals(metadata, { uncompressedSize: 0n, encodedName: new TextEncoder().encode("root/folder"), nameIsBuffer: true })
})
