Skip to content

Commit

Permalink
Desktop: PDF search text: Remove NULL characters early to avoid possi…
Browse files Browse the repository at this point in the history
…ble sync issues (#9862)
  • Loading branch information
personalizedrefrigerator authored Feb 6, 2024
1 parent 8b9ce9e commit a906e73
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 5 deletions.
2 changes: 2 additions & 0 deletions .eslintignore
Original file line number Diff line number Diff line change
Expand Up @@ -1002,6 +1002,8 @@ packages/lib/types.js
packages/lib/utils/credentialFiles.js
packages/lib/utils/joplinCloud.js
packages/lib/utils/processStartFlags.js
packages/lib/utils/replaceUnsupportedCharacters.test.js
packages/lib/utils/replaceUnsupportedCharacters.js
packages/lib/utils/userFetcher.js
packages/lib/utils/webDAVUtils.test.js
packages/lib/utils/webDAVUtils.js
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -982,6 +982,8 @@ packages/lib/types.js
packages/lib/utils/credentialFiles.js
packages/lib/utils/joplinCloud.js
packages/lib/utils/processStartFlags.js
packages/lib/utils/replaceUnsupportedCharacters.test.js
packages/lib/utils/replaceUnsupportedCharacters.js
packages/lib/utils/userFetcher.js
packages/lib/utils/webDAVUtils.test.js
packages/lib/utils/webDAVUtils.js
Expand Down
13 changes: 12 additions & 1 deletion packages/lib/services/search/SearchEngine.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -322,10 +322,21 @@ describe('services/SearchEngine', () => {
}));

it('should support searching through documents that contain null characters', (async () => {
await Note.save({ title: 'Test', body: 'Test\x00testing' });
await Note.save({
title: 'Test',
body: `
NUL characters, "\x00", have been known to break FTS search.
Previously, all characters after a NUL (\x00) character in a note
would not show up in search results. NUL characters may have also
broken search for other notes.
In this note, "testing" only appears after the NUL characters.
`,
});

await engine.syncTables();

expect((await engine.search('previously')).length).toBe(1);
expect((await engine.search('testing')).length).toBe(1);
}));

Expand Down
6 changes: 3 additions & 3 deletions packages/lib/services/search/SearchEngine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import JoplinDatabase from '../../JoplinDatabase';
import NoteResource from '../../models/NoteResource';
import BaseItem from '../../models/BaseItem';
import { isCallbackUrl, parseCallbackUrl } from '../../callbackUrlUtils';
import replaceUnsupportedCharacters from '../../utils/replaceUnsupportedCharacters';
const { sprintf } = require('sprintf-js');
const { pregQuote, scriptType, removeDiacritics } = require('../../string-utils.js');

Expand Down Expand Up @@ -603,9 +604,8 @@ export default class SearchEngine {
private normalizeText_(text: string) {
let normalizedText = text.normalize ? text.normalize() : text;

// Null characters can break FTS. Remove them.
// eslint-disable-next-line no-control-regex
normalizedText = normalizedText.replace(/\x00/g, ' ');
// NULL characters can break FTS. Remove them.
normalizedText = replaceUnsupportedCharacters(normalizedText);

return removeDiacritics(normalizedText.toLowerCase());
}
Expand Down
6 changes: 5 additions & 1 deletion packages/lib/shim-init-node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import * as pdfJsNamespace from 'pdfjs-dist';
import { writeFile } from 'fs/promises';
import { ResourceEntity } from './services/database/types';
import { TextItem } from 'pdfjs-dist/types/src/display/api';
import replaceUnsupportedCharacters from './utils/replaceUnsupportedCharacters';

const { FileApiDriverLocal } = require('./file-api-driver-local');
const mimeUtils = require('./mime-utils.js').mime;
Expand Down Expand Up @@ -749,7 +750,10 @@ function shimInit(options: ShimInitOptions = null) {
const text = (item as TextItem).str ?? '';
return text;
}).join('\n');
textByPage.push(strings);

// Some PDFs contain unsupported characters that can lead to hard-to-debug issues.
// We remove them here.
textByPage.push(replaceUnsupportedCharacters(strings));
}

return textByPage;
Expand Down
8 changes: 8 additions & 0 deletions packages/lib/utils/replaceUnsupportedCharacters.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import replaceUnsupportedCharacters from './replaceUnsupportedCharacters';

describe('replaceUnsupportedCharacters', () => {
test('should replace NULL characters', () => {
expect(replaceUnsupportedCharacters('Test\x00...')).toBe('Test�...');
expect(replaceUnsupportedCharacters('\x00Test\x00...')).toBe('�Test�...');
});
});
17 changes: 17 additions & 0 deletions packages/lib/utils/replaceUnsupportedCharacters.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

const replaceUnsupportedCharacters = (text: string) => {
// In the past, NULL characters have caused sync and search issues.
// Because these issues are often difficult to debug, we remove these characters entirely.
//
// See
// - Sync issue: https://github.com/laurent22/joplin/issues/5046
// - Search issue: https://github.com/laurent22/joplin/issues/9775
//
// As per the commonmark spec, we replace \x00 with the replacement character.
// (see https://spec.commonmark.org/0.31.2/#insecure-characters).
//
// eslint-disable-next-line no-control-regex
return text.replace(/\x00/g, '\uFFFD');
};

export default replaceUnsupportedCharacters;

0 comments on commit a906e73

Please sign in to comment.