fs: improve promise based readFile performance for big files

This significantly reduces the peak memory for the promise based readFile operation by reusing a single memory chunk after each read and strinigifying that chunk immediately. Signed-off-by: Ruben Bridgewater <ruben@bridgewater.de>
nodejs · Aug 20, 2022 · e72802b · e72802b
1 parent 5e57d24
commit e72802b
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 38 deletions.
diff --git a/benchmark/fs/readfile-promises.js b/benchmark/fs/readfile-promises.js
@@ -15,11 +15,19 @@ const filename = path.resolve(tmpdir.path,
 
 const bench = common.createBenchmark(main, {
   duration: [5],
-  len: [1024, 16 * 1024 * 1024],
+  len: [
+    1024,
+    512 * 1024,
+    4 * 1024 ** 2,
+    8 * 1024 ** 2,
+    16 * 1024 ** 2,
+    32 * 1024 ** 2,
+  ],
+  encoding: ['', 'utf8'],
   concurrent: [1, 10]
 });
 
-function main({ len, duration, concurrent }) {
+function main({ len, duration, encoding, concurrent }) {
   try {
     fs.unlinkSync(filename);
   } catch {
@@ -44,7 +52,7 @@ function main({ len, duration, concurrent }) {
   }, duration * 1000);
 
   function read() {
-    fs.promises.readFile(filename)
+    fs.promises.readFile(filename, { encoding })
       .then((res) => afterRead(undefined, res))
       .catch((err) => afterRead(err));
   }

diff --git a/lib/fs.js b/lib/fs.js
@@ -343,6 +343,9 @@ function readFileAfterStat(err, stats) {
   if (err)
     return context.close(err);
 
+  // TODO(BridgeAR): Check if allocating a smaller chunk is better performance
+  // wise, similar to the promise based version (less peak memory and chunked
+  // stringify operations vs multiple C++/JS boundary crossings).
   const size = context.size = isFileType(stats, S_IFREG) ? stats[8] : 0;
 
   if (size > kIoMaxLength) {
@@ -352,6 +355,9 @@ function readFileAfterStat(err, stats) {
 
   try {
     if (size === 0) {
+      // TODO(BridgeAR): We are able to optimize this in case an encoding is used. If
+      // that's the case, let's use the StringDecoder and directly concat the
+      // result and to reuse the former chunk instead of allocating a new one.
       context.buffers = [];
     } else {
       context.buffer = Buffer.allocUnsafeSlow(size);

diff --git a/lib/internal/fs/promises.js b/lib/internal/fs/promises.js
@@ -86,6 +86,7 @@ const {
   promisify,
 } = require('internal/util');
 const { EventEmitterMixin } = require('internal/event_target');
+const { StringDecoder } = require('string_decoder');
 const { watch } = require('internal/fs/watchers');
 const { isIterable } = require('internal/streams/utils');
 const assert = require('internal/assert');
@@ -416,63 +417,76 @@ async function writeFileHandle(filehandle, data, signal, encoding) {
 
 async function readFileHandle(filehandle, options) {
   const signal = options?.signal;
+  const encoding = options?.encoding;
+  const decoder = encoding && new StringDecoder(encoding);
 
   checkAborted(signal);
 
   const statFields = await binding.fstat(filehandle.fd, false, kUsePromises);
 
   checkAborted(signal);
 
-  let size;
+  let size = 0;
+  let length = 0;
   if ((statFields[1/* mode */] & S_IFMT) === S_IFREG) {
     size = statFields[8/* size */];
-  } else {
-    size = 0;
+    length = encoding ? MathMin(size, kReadFileBufferLength) : size;
+  }
+  if (length === 0) {
+    length = kReadFileUnknownBufferLength;
   }
 
   if (size > kIoMaxLength)
     throw new ERR_FS_FILE_TOO_LARGE(size);
 
-  let endOfFile = false;
   let totalRead = 0;
-  const noSize = size === 0;
-  const buffers = [];
-  const fullBuffer = noSize ? undefined : Buffer.allocUnsafeSlow(size);
-  do {
+  let buffer = Buffer.allocUnsafeSlow(length);
+  let result = '';
+  let offset = 0;
+  let buffers;
+
+  while (true) {
     checkAborted(signal);
-    let buffer;
-    let offset;
-    let length;
-    if (noSize) {
-      buffer = Buffer.allocUnsafeSlow(kReadFileUnknownBufferLength);
-      offset = 0;
-      length = kReadFileUnknownBufferLength;
-    } else {
-      buffer = fullBuffer;
-      offset = totalRead;
-      length = MathMin(size - totalRead, kReadFileBufferLength);
-    }
 
     const bytesRead = (await binding.read(filehandle.fd, buffer, offset,
-                                          length, -1, kUsePromises)) || 0;
+                                          length, -1, kUsePromises)) ?? 0;
     totalRead += bytesRead;
-    endOfFile = bytesRead === 0 || totalRead === size;
-    if (noSize && bytesRead > 0) {
-      const isBufferFull = bytesRead === kReadFileUnknownBufferLength;
-      const chunkBuffer = isBufferFull ? buffer : buffer.slice(0, bytesRead);
-      ArrayPrototypePush(buffers, chunkBuffer);
+
+    if (bytesRead === 0 || totalRead === size || bytesRead !== buffer.length) {
+      const singleRead = bytesRead === totalRead;
+
+      if (bytesRead !== buffer.length) {
+        buffer = buffer.slice(0, bytesRead);
+      }
+
+      if (!encoding) {
+        if (size === 0 && !singleRead) {
+          ArrayPrototypePush(buffers, buffer);
+          return Buffer.concat(buffers, totalRead);
+        }
+        return buffer;
+      }
+
+      if (singleRead) {
+        return buffer.toString(encoding);
+      }
+      result += decoder.end(buffer);
+      return result;
     }
-  } while (!endOfFile);
 
-  let result;
-  if (size > 0) {
-    result = totalRead === size ? fullBuffer : fullBuffer.slice(0, totalRead);
-  } else {
-    result = buffers.length === 1 ? buffers[0] : Buffer.concat(buffers,
-                                                               totalRead);
+    if (encoding) {
+      result += decoder.write(buffer);
+    } else if (size !== 0) {
+      // TODO(BridgeAR): This condition needs a test. A file should be written
+      // that is chunked without encoding.
+      offset += bytesRead;
+    } else {
+      buffers ??= [];
+      // Unknown file size requires chunks.
+      ArrayPrototypePush(buffers, buffer);
+      buffer = Buffer.allocUnsafeSlow(kReadFileUnknownBufferLength);
+    }
   }
-
-  return options.encoding ? result.toString(options.encoding) : result;
 }
 
 // All of the functions are defined as async in order to ensure that errors