From d4d400217b0e000b8262febd1ca8dd2f6a2d9783 Mon Sep 17 00:00:00 2001
From: pedr <pedrlz.frn@gmail.com>
Date: Mon, 22 Jan 2024 11:46:18 -0300
Subject: [PATCH] API: Increase protection of files downloaded via the REST API
 (#9676)

---
 .../lib/services/rest/routes/notes.test.ts    | 95 ++++++++++++++-----
 packages/lib/services/rest/routes/notes.ts    | 82 ++++++++++------
 2 files changed, 126 insertions(+), 51 deletions(-)

diff --git a/packages/lib/services/rest/routes/notes.test.ts b/packages/lib/services/rest/routes/notes.test.ts
index 6e49eb11a74..05471976bd5 100644
--- a/packages/lib/services/rest/routes/notes.test.ts
+++ b/packages/lib/services/rest/routes/notes.test.ts
@@ -1,7 +1,12 @@
 import Logger from '@joplin/utils/Logger';
 import shim from '../../../shim';
-import uuid from '../../../uuid';
 import { downloadMediaFile } from './notes';
+import Setting from '../../../models/Setting';
+import { readFile, readdir, remove, writeFile } from 'fs-extra';
+const md5 = require('md5');
+
+const imagePath = `${__dirname}/../../../images/SideMenuHeader.png`;
+const jpgBase64Content = '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAMCAgICAgMCAgIDAwMDBAYEBAQEBAgGBgUGCQgKCgkICQkKDA8MCgsOCwkJDRENDg8QEBEQCgwSExIQEw8QEBD/wAALCAAFAAUBAREA/8QAFAABAAAAAAAAAAAAAAAAAAAACf/EAB8QAAEEAQUBAAAAAAAAAAAAAAQBAgUGAwAREiExM//aAAgBAQAAPwBJarVpGHm7KWbapCSwyZ6FDjkLyYE1W/LHyV2zfOk2TrzX/9k=';
 
 describe('routes/notes', () => {
 
@@ -21,44 +26,50 @@ describe('routes/notes', () => {
 		'https://joplinapp.org/valid/image_url.png',
 		'http://joplinapp.org/valid/image_url.png',
 	])('should try to download and return a local path to a valid URL', async (url) => {
-		const fetchBlobSpy = jest.fn();
-		jest.spyOn(shim, 'fetchBlob').mockImplementation(fetchBlobSpy);
-		jest.spyOn(uuid, 'create').mockReturnValue('mocked_uuid_value');
+		const fetchBlobSpy = jest.fn(async (_url, options) => {
+			await writeFile(options.path, Buffer.from(jpgBase64Content, 'base64'));
+		});
+		const spy = jest.spyOn(shim, 'fetchBlob').mockImplementation(fetchBlobSpy);
 
 		const response = await downloadMediaFile(url);
 
-		expect(response.endsWith('mocked_uuid_value.png')).toBe(true);
-		expect(fetchBlobSpy).toBeCalledTimes(1);
+		const files = await readdir(Setting.value('tempDir'));
+
+		expect(files.length).toBe(1);
+		expect(fetchBlobSpy).toHaveBeenCalledTimes(1);
+		expect(response).toBe(`${Setting.value('tempDir')}/${files[0]}`);
+		await remove(response);
+		spy.mockRestore();
 	});
 
 	test('should get file from local drive if protocol allows it', async () => {
-		const url = 'file://valid/image.png';
+		const url = `file:///${imagePath}`;
+		const originalFileContent = await readFile(imagePath);
 
-		const fsDriverCopySpy = jest.fn();
-		jest.spyOn(shim, 'fsDriver').mockImplementation(() => {
-			return {
-				copy: fsDriverCopySpy,
-			} as any;
-		});
-		jest.spyOn(uuid, 'create').mockReturnValue('mocked_uuid_value');
+		const response = await downloadMediaFile(url, null, ['file:']);
 
-		const response = await downloadMediaFile(url);
+		const files = await readdir(Setting.value('tempDir'));
+		expect(files.length).toBe(1);
+		expect(response).toBe(`${Setting.value('tempDir')}/${files[0]}`);
 
-		expect(response.endsWith('mocked_uuid_value.png')).toBe(true);
-		expect(fsDriverCopySpy).toBeCalledTimes(1);
+		const responseFileContent = await readFile(response);
+		expect(md5(responseFileContent)).toBe(md5(originalFileContent));
+		await remove(response);
 	});
 
 	test('should be able to handle URLs with data', async () => {
 		const url = 'data:image/gif;base64,R0lGODlhEAAQAMQAAORHHOVSKudfOulrSOp3WOyDZu6QdvCchPGolfO0o/XBs/fNwfjZ0frl3/zy7////wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACH5BAkAABAALAAAAAAQABAAAAVVICSOZGlCQAosJ6mu7fiyZeKqNKToQGDsM8hBADgUXoGAiqhSvp5QAnQKGIgUhwFUYLCVDFCrKUE1lBavAViFIDlTImbKC5Gm2hB0SlBCBMQiB0UjIQA7';
-
-		const imageFromDataUrlSpy = jest.fn();
-		jest.spyOn(shim, 'imageFromDataUrl').mockImplementation(imageFromDataUrlSpy);
-		jest.spyOn(uuid, 'create').mockReturnValue('mocked_uuid_value');
+		const originalFileContent = Buffer.from(url.split('data:image/gif;base64,')[1], 'base64');
 
 		const response = await downloadMediaFile(url);
 
-		expect(response.endsWith('mocked_uuid_value.gif')).toBe(true);
-		expect(imageFromDataUrlSpy).toBeCalledTimes(1);
+		const files = await readdir(Setting.value('tempDir'));
+		expect(files.length).toBe(1);
+		expect(response).toBe(`${Setting.value('tempDir')}/${files[0]}`);
+
+		const responseFileContent = await readFile(response);
+		expect(md5(responseFileContent)).toBe(md5(originalFileContent));
+		await remove(response);
 	});
 
 	test('should not process URLs with data that is not image type', async () => {
@@ -68,6 +79,8 @@ describe('routes/notes', () => {
 		const response = await downloadMediaFile(url);
 		Logger.globalLogger.enabled = true;
 
+		const files = await readdir(Setting.value('tempDir'));
+		expect(files.length).toBe(0);
 		expect(response).toBe('');
 	});
 
@@ -76,6 +89,42 @@ describe('routes/notes', () => {
 
 		const response = await downloadMediaFile(url);
 
+		const files = await readdir(Setting.value('tempDir'));
+		expect(files.length).toBe(0);
 		expect(response).toBe('');
 	});
+
+	test('should not copy content from invalid protocols', async () => {
+		const url = 'file:///home/user/file.db';
+
+		const allowedProtocols: string[] = [];
+		const mediaFilePath = await downloadMediaFile(url, null, allowedProtocols);
+
+		const files = await readdir(Setting.value('tempDir'));
+		expect(files.length).toBe(0);
+		expect(mediaFilePath).toBe('');
+	});
+
+	test.each([
+		'https://joplinapp.org/valid/image_url',
+		'https://joplinapp.org/valid/image_url.invalid_url',
+	])('should correct the file extension in filename from files without or invalid ones', async (url) => {
+		const spy = jest.spyOn(shim, 'fetchBlob').mockImplementation(async (_url, options) => {
+			await writeFile(options.path, Buffer.from(jpgBase64Content, 'base64'));
+			return {
+				headers: {
+					'content-type': 'image/jpg',
+				},
+			};
+		});
+
+		const response = await downloadMediaFile(url);
+
+		const files = await readdir(Setting.value('tempDir'));
+		expect(files.length).toBe(1);
+		expect(response).toBe(`${Setting.value('tempDir')}/${files[0]}`);
+
+		await remove(response);
+		spy.mockRestore();
+	});
 });
diff --git a/packages/lib/services/rest/routes/notes.ts b/packages/lib/services/rest/routes/notes.ts
index 35346890509..ca375a0a1d1 100644
--- a/packages/lib/services/rest/routes/notes.ts
+++ b/packages/lib/services/rest/routes/notes.ts
@@ -189,67 +189,86 @@ async function tryToGuessExtFromMimeType(response: any, mediaPath: string) {
 	return newMediaPath;
 }
 
-export async function downloadMediaFile(url: string, fetchOptions?: FetchOptions) {
-	logger.info('Downloading media file', url);
 
-	const tempDir = Setting.value('tempDir');
+const getFileExtension = (url: string, isDataUrl: boolean) => {
+	let fileExt = isDataUrl ? mimeUtils.toFileExtension(mimeUtils.fromDataUrl(url)) : safeFileExtension(fileExtension(url).toLowerCase());
+	if (!mimeUtils.fromFileExtension(fileExt)) fileExt = ''; // If the file extension is unknown - clear it.
+	if (fileExt) fileExt = `.${fileExt}`;
 
-	// The URL we get to download have been extracted from the Markdown document
-	url = markdownUtils.unescapeLinkUrl(url);
+	return fileExt;
+};
 
-	const isDataUrl = url && url.toLowerCase().indexOf('data:') === 0;
+const generateMediaPath = (url: string, isDataUrl: boolean, fileExt: string) => {
+	const tempDir = Setting.value('tempDir');
+	const name = isDataUrl ? md5(`${Math.random()}_${Date.now()}`) : filename(url);
+	// Append a UUID because simply checking if the file exists is not enough since
+	// multiple resources can be downloaded at the same time (race condition).
+	const mediaPath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
+	return mediaPath;
+};
+
+const isValidUrl = (url: string, isDataUrl: boolean, urlProtocol?: string, allowedProtocols?: string[]) => {
+	if (!urlProtocol) return false;
 
 	// PDFs and other heavy resoucres are often served as seperate files insted of data urls, its very unlikely to encounter a pdf as a data url
 	if (isDataUrl && !url.toLowerCase().startsWith('data:image/')) {
 		logger.warn(`Resources in data URL format is only supported for images ${url}`);
-		return '';
+		return false;
 	}
 
-	const invalidProtocols = ['cid:'];
+	const defaultAllowedProtocols = ['http:', 'https:', 'data:'];
+	const allowed = allowedProtocols ?? defaultAllowedProtocols;
+	const isAllowedProtocol = allowed.includes(urlProtocol);
+
+	return isAllowedProtocol;
+};
+
+export async function downloadMediaFile(url: string, fetchOptions?: FetchOptions, allowedProtocols?: string[]) {
+	logger.info('Downloading media file', url);
+
+	// The URL we get to download have been extracted from the Markdown document
+	url = markdownUtils.unescapeLinkUrl(url);
+
+	const isDataUrl = url && url.toLowerCase().indexOf('data:') === 0;
 	const urlProtocol = urlUtils.urlProtocol(url)?.toLowerCase();
 
-	if (!urlProtocol || invalidProtocols.includes(urlProtocol)) {
+	if (!isValidUrl(url, isDataUrl, urlProtocol, allowedProtocols)) {
 		return '';
 	}
 
-	const name = isDataUrl ? md5(`${Math.random()}_${Date.now()}`) : filename(url);
-	let fileExt = isDataUrl ? mimeUtils.toFileExtension(mimeUtils.fromDataUrl(url)) : safeFileExtension(fileExtension(url).toLowerCase());
-	if (!mimeUtils.fromFileExtension(fileExt)) fileExt = ''; // If the file extension is unknown - clear it.
-	if (fileExt) fileExt = `.${fileExt}`;
-
-	// Append a UUID because simply checking if the file exists is not enough since
-	// multiple resources can be downloaded at the same time (race condition).
-	let mediaPath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
+	const fileExt = getFileExtension(url, isDataUrl);
+	const mediaPath = generateMediaPath(url, isDataUrl, fileExt);
+	let newMediaPath = undefined;
 
 	try {
 		if (isDataUrl) {
 			await shim.imageFromDataUrl(url, mediaPath);
 		} else if (urlProtocol === 'file:') {
-			// Can't think of any reason to disallow this at this point
-			// if (!allowFileProtocolImages) throw new Error('For security reasons, this URL with file:// protocol cannot be downloaded');
 			const localPath = fileUriToPath(url);
 			await shim.fsDriver().copy(localPath, mediaPath);
 		} else {
 			const response = await shim.fetchBlob(url, { path: mediaPath, maxRetry: 1, ...fetchOptions });
 
-			// If we could not find the file extension from the URL, try to get it
-			// now based on the Content-Type header.
-			if (!fileExt) mediaPath = await tryToGuessExtFromMimeType(response, mediaPath);
+			if (!fileExt) {
+				// If we could not find the file extension from the URL, try to get it
+				// now based on the Content-Type header.
+				newMediaPath = await tryToGuessExtFromMimeType(response, mediaPath);
+			}
 		}
-		return mediaPath;
+		return newMediaPath ?? mediaPath;
 	} catch (error) {
 		logger.warn(`Cannot download image at ${url}`, error);
 		return '';
 	}
 }
 
-async function downloadMediaFiles(urls: string[], fetchOptions?: FetchOptions) {
+async function downloadMediaFiles(urls: string[], fetchOptions?: FetchOptions, allowedProtocols?: string[]) {
 	const PromisePool = require('es6-promise-pool');
 
 	const output: any = {};
 
 	const downloadOne = async (url: string) => {
-		const mediaPath = await downloadMediaFile(url, fetchOptions); // , allowFileProtocolImages);
+		const mediaPath = await downloadMediaFile(url, fetchOptions, allowedProtocols);
 		if (mediaPath) output[url] = { path: mediaPath, originalUrl: url };
 	};
 
@@ -374,14 +393,20 @@ async function attachImageFromDataUrl(note: any, imageDataUrl: string, cropRect:
 	return await shim.attachFileToNote(note, tempFilePath);
 }
 
-export const extractNoteFromHTML = async (requestNote: RequestNote, requestId: number, imageSizes: any, fetchOptions?: FetchOptions) => {
+export const extractNoteFromHTML = async (
+	requestNote: RequestNote,
+	requestId: number,
+	imageSizes: any,
+	fetchOptions?: FetchOptions,
+	allowedProtocols?: string[],
+) => {
 	const note = await requestNoteToNote(requestNote);
 
 	const mediaUrls = extractMediaUrls(note.markup_language, note.body);
 
 	logger.info(`Request (${requestId}): Downloading media files: ${mediaUrls.length}`);
 
-	const mediaFiles = await downloadMediaFiles(mediaUrls, fetchOptions); // , allowFileProtocolImages);
+	const mediaFiles = await downloadMediaFiles(mediaUrls, fetchOptions, allowedProtocols);
 
 	logger.info(`Request (${requestId}): Creating resources from paths: ${Object.getOwnPropertyNames(mediaFiles).length}`);
 
@@ -433,7 +458,8 @@ export default async function(request: Request, id: string = null, link: string
 
 		logger.info('Images:', imageSizes);
 
-		const extracted = await extractNoteFromHTML(requestNote, requestId, imageSizes);
+		const allowedProtocolsForDownloadMediaFiles = ['http:', 'https:', 'file:', 'data:'];
+		const extracted = await extractNoteFromHTML(requestNote, requestId, imageSizes, undefined, allowedProtocolsForDownloadMediaFiles);
 
 		let note = await Note.save(extracted.note, extracted.saveOptions);