diff --git a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java
new file mode 100644
index 000000000..3ac479758
--- /dev/null
+++ b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStream.java
@@ -0,0 +1,270 @@
+/*******************************************************************************
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Sebastian Thomschke - initial implementation
+ *******************************************************************************/
+package org.eclipse.tm4e.ui.internal.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
+import java.util.Objects;
+import java.util.function.IntSupplier;
+
+import org.eclipse.jdt.annotation.Nullable;
+
+/**
+ * {@link InputStream} that encodes an index-addressable sequence of chars into bytes on demand,
+ * one chunk of at most {@code bufferSize} code points at a time.
+ * <p>
+ * Unpaired surrogates (high or low) are substituted with U+FFFD before encoding. Instances hold
+ * mutable buffers and perform no synchronization.
+ */
+class CharsInputStream extends InputStream {
+
+    /** Indexed access to the chars to encode; an implementation may fail with any exception. */
+    @FunctionalInterface
+    interface CharsSupplier {
+        char charAt(int index) throws Exception;
+    }
+
+    /** Encoder life cycle; the state only ever advances ENCODING -> FLUSHING -> DONE. */
+    enum EncoderState {
+        ENCODING,
+        FLUSHING,
+        DONE
+    }
+
+    /** 512 surrogate character pairs */
+    private static final int DEFAULT_BUFFER_SIZE = 512;
+    private static final int EOF = -1;
+    /** Substitute for surrogates that are not part of a valid pair. */
+    private static final char REPLACEMENT_CHAR = '\uFFFD';
+
+    private final int bufferSize;
+    private final CharBuffer charBuffer;
+    private final ByteBuffer byteBuffer;
+    private final CharsetEncoder encoder;
+    private EncoderState encoderState = EncoderState.ENCODING;
+
+    private int charIndex = 0;
+    private final CharsSupplier chars;
+    private final IntSupplier charsLength;
+
+    CharsInputStream(final CharSequence chars) {
+        this(chars, null);
+    }
+
+    CharsInputStream(final CharSequence chars, final @Nullable Charset charset) {
+        this(chars, charset, DEFAULT_BUFFER_SIZE);
+    }
+
+    CharsInputStream(final CharSequence chars, final @Nullable Charset charset, final int bufferSize) {
+        this(chars::charAt, chars::length, charset, bufferSize);
+    }
+
+    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength) {
+        this(chars, charsLength, null);
+    }
+
+    /**
+     * @param chars function to access indexed chars.
+     * @param charsLength function to get the number of indexed chars provided by the chars parameter.
+     * @param charset charset to encode with; UTF-8 is used if {@code null}.
+     */
+    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset) {
+        this(chars, charsLength, charset, DEFAULT_BUFFER_SIZE);
+    }
+
+    /**
+     * @param chars function to access indexed chars.
+     * @param charsLength function to get the number of indexed chars provided by the chars parameter.
+     * @param charset charset to encode with; UTF-8 is used if {@code null}.
+     * @param bufferSize number of surrogate character pairs to encode at once.
+     * @throws IllegalArgumentException if {@code bufferSize} is smaller than 1.
+     */
+    CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final @Nullable Charset charset, final int bufferSize) {
+        if (bufferSize < 1) {
+            throw new IllegalArgumentException("[bufferSize] must be 1 or larger");
+        }
+        encoder = (charset == null ? StandardCharsets.UTF_8 : charset).newEncoder();
+
+        this.bufferSize = bufferSize;
+        final int maxChars = bufferSize * 2; // every code point may be a high/low surrogate pair
+        charBuffer = CharBuffer.allocate(maxChars);
+        // 4 bytes per code point are only guaranteed to suffice for UTF-8; honor the worst case
+        // reported by the encoder so that a completely filled char buffer always fits
+        final int maxBytes = (int) Math.ceil(maxChars * (double) encoder.maxBytesPerChar());
+        byteBuffer = ByteBuffer.allocate(Math.max(bufferSize * 4, maxBytes));
+        // both buffers start out empty in read mode
+        byteBuffer.flip();
+        charBuffer.flip();
+
+        this.chars = chars;
+        this.charsLength = charsLength;
+    }
+
+    /**
+     * @return the number of already encoded bytes if there are any, otherwise the number of chars
+     *         not fetched yet (an estimate only, one char may encode to more than one byte).
+     */
+    @Override
+    public int available() {
+        final int buffered = byteBuffer.remaining();
+        if (buffered > 0) {
+            return buffered;
+        }
+        // never negative, even if the underlying sequence shrank in the meantime
+        return Math.max(0, charsLength.getAsInt() - charIndex);
+    }
+
+    @Override
+    public int read() throws IOException {
+        // loop instead of a single check: a successful refill is not guaranteed to produce bytes
+        while (!byteBuffer.hasRemaining()) {
+            if (!refillBuffer()) {
+                return EOF;
+            }
+        }
+        return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
+    }
+
+    @Override
+    public int read(final byte[] buf, final int off, final int bytesToRead) throws IOException {
+        Objects.checkFromIndexSize(off, bytesToRead, buf.length);
+        if (bytesToRead == 0) {
+            return 0;
+        }
+
+        int bytesRead = 0;
+        while (bytesRead < bytesToRead) {
+            if (!byteBuffer.hasRemaining() && !refillBuffer()) {
+                return bytesRead == 0 ? EOF : bytesRead;
+            }
+            final int chunk = Math.min(bytesToRead - bytesRead, byteBuffer.remaining());
+            byteBuffer.get(buf, off + bytesRead, chunk);
+            bytesRead += chunk;
+        }
+        return bytesRead;
+    }
+
+    /**
+     * Replaces the content of {@link #byteBuffer} with the next chunk of encoded bytes.
+     *
+     * @return {@code false} if the encoder is done and no further bytes will follow.
+     */
+    private boolean refillBuffer() throws IOException {
+        if (encoderState == EncoderState.DONE) {
+            return false;
+        }
+        if (encoderState == EncoderState.FLUSHING) {
+            return flushEncoder();
+        }
+
+        // only fetch new chars once the previous chunk was encoded completely, otherwise chars
+        // left behind by an encoder overflow would silently be dropped
+        if (!charBuffer.hasRemaining()) {
+            final int charsLen = charsLength.getAsInt();
+            if (charIndex >= charsLen) {
+                return finishEncoding();
+            }
+            fillCharBuffer(charsLen);
+        }
+
+        // encode chars into bytes
+        byteBuffer.clear();
+        final CoderResult result = encoder.encode(charBuffer, byteBuffer, false);
+        byteBuffer.flip();
+        if (result.isError()) {
+            result.throwException();
+        }
+        return true;
+    }
+
+    /**
+     * Loads up to {@link #bufferSize} code points, starting at {@link #charIndex}, into
+     * {@link #charBuffer} and leaves the buffer in read mode.
+     *
+     * @throws IOException wrapping any exception raised by the chars supplier.
+     */
+    private void fillCharBuffer(final int charsLen) throws IOException {
+        charBuffer.clear();
+        try {
+            for (int i = 0; i < bufferSize && charIndex < charsLen; i++) {
+                final char ch = chars.charAt(charIndex++);
+                if (!Character.isSurrogate(ch)) {
+                    charBuffer.put(ch);
+                    continue;
+                }
+                // a surrogate is only valid as a high surrogate directly followed by a low surrogate
+                final char next = Character.isHighSurrogate(ch) && charIndex < charsLen ? chars.charAt(charIndex) : '\0';
+                if (Character.isLowSurrogate(next)) {
+                    charIndex++;
+                    charBuffer.put(ch);
+                    charBuffer.put(next);
+                } else {
+                    // unpaired high or low surrogate - fallback to replacement character
+                    charBuffer.put(REPLACEMENT_CHAR);
+                }
+            }
+        } catch (final Exception ex) {
+            throw new IOException(ex);
+        } finally {
+            charBuffer.flip();
+        }
+    }
+
+    /**
+     * Signals the end of input to the encoder and switches to the flushing state.
+     *
+     * @return {@code true} if {@link #byteBuffer} received bytes or flushing has to continue.
+     */
+    private boolean finishEncoding() throws IOException {
+        byteBuffer.clear();
+        final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */);
+        byteBuffer.flip();
+        if (result.isError()) {
+            result.throwException();
+        }
+        encoderState = EncoderState.FLUSHING;
+        // bytes written by the final encode step must be handed out before flushing clears the buffer
+        return byteBuffer.hasRemaining() || flushEncoder();
+    }
+
+    /**
+     * Flushes the encoder's internal state into {@link #byteBuffer}.
+     *
+     * @return {@code true} if bytes were flushed or flushing has to continue on the next refill.
+     */
+    private boolean flushEncoder() throws IOException {
+        if (encoderState == EncoderState.DONE) {
+            return false;
+        }
+        encoderState = EncoderState.FLUSHING;
+
+        byteBuffer.clear();
+        final CoderResult result = encoder.flush(byteBuffer);
+        byteBuffer.flip();
+
+        if (result.isOverflow()) { // byteBuffer too small, stay in FLUSHING state
+            return true;
+        }
+        if (result.isError()) {
+            result.throwException();
+        }
+
+        encoderState = EncoderState.DONE;
+        return byteBuffer.hasRemaining();
+    }
+}
diff --git a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java
index 43cc25410..877179036 100644
--- a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java
+++ b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/ContentTypeHelper.java
@@ -112,7 +112,7 @@ private static ContentTypeInfo getContentTypes(final ITextFileBuffer buffer) {
if (bufferContentType != null) {
contentTypes.add(bufferContentType);
}
- if (buffer.isDirty()) {
+ if (buffer.isDirty() && buffer.getDocument() != null) {
// Buffer is dirty (content of the filesystem is not synch with
// the editor content), use IDocument content.
try (var input = new DocumentInputStream(buffer.getDocument())) {
diff --git a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java
index 69bc716d3..c5934133a 100644
--- a/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java
+++ b/org.eclipse.tm4e.ui/src/main/java/org/eclipse/tm4e/ui/internal/utils/DocumentInputStream.java
@@ -1,71 +1,45 @@
/*******************************************************************************
- * Copyright (c) 2005, 2008 IBM Corporation and others.
- * All rights reserved. This program and the accompanying materials
- * are made available under the terms of the Eclipse Public License v1.0
- * which accompanies this distribution, and is available at
- * http://www.eclipse.org/legal/epl-v10.html
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
*
* Contributors:
- * IBM Corporation - initial API and implementation
- * QNX Software System
- * Sebastian Thomschke - implement read(byte[], int, int)
+ * Sebastian Thomschke - initial implementation
*******************************************************************************/
package org.eclipse.tm4e.ui.internal.utils;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Objects;
+import java.nio.charset.Charset;
-import org.eclipse.jface.text.BadLocationException;
+import org.eclipse.core.filebuffers.FileBuffers;
+import org.eclipse.core.filebuffers.ITextFileBuffer;
+import org.eclipse.core.filebuffers.ITextFileBufferManager;
+import org.eclipse.jdt.annotation.Nullable;
import org.eclipse.jface.text.IDocument;
+import org.eclipse.tm4e.ui.TMUIPlugin;
-/**
- * Input stream which reads from a document
- */
-final class DocumentInputStream extends InputStream {
-
- private final IDocument doc;
- private int pos = 0;
-
- DocumentInputStream(final IDocument document) {
- doc = document;
- }
-
- @Override
- public int read(final byte[] buff, final int buffOffset, final int len) throws IOException {
- Objects.checkFromIndexSize(buffOffset, len, buff.length);
+final class DocumentInputStream extends CharsInputStream { // exposes the chars of an IDocument as an encoded byte stream
- if (len == 0)
- return 0;
-
- final var docLen = doc.getLength();
- if (pos >= docLen)
- return -1;
-
- var bytesRead = -1;
+ private static @Nullable Charset getCharset(final IDocument document) { // encoding of the document's text file buffer, or null if it cannot be determined
+ final ITextFileBufferManager bufferManager = FileBuffers.getTextFileBufferManager();
+ if (bufferManager == null)
+ return null; // no buffer manager, hence no encoding information
+ final ITextFileBuffer buffer = bufferManager.getTextFileBuffer(document);
+ if (buffer == null)
+ return null; // the manager has no text file buffer for this document
try {
- buff[buffOffset] = (byte) doc.getChar(pos++);
- bytesRead = 1;
- while (bytesRead < len) {
- if (pos >= docLen) {
- break;
- }
- buff[buffOffset + bytesRead++] = (byte) doc.getChar(pos++);
- }
- } catch (final BadLocationException ex) {
- // ignore
+ final String charsetName = buffer.getEncoding();
+ if (charsetName != null)
+ return Charset.forName(charsetName); // throws for illegal or unsupported names, handled by the catch below
+ } catch (final Exception ex) {
+ TMUIPlugin.logError(ex); // best effort: log the problem and fall through to null
}
- return bytesRead;
+ return null; // encoding unknown
}
- @Override
- public int read() throws IOException {
- try {
- if (pos < doc.getLength())
- return doc.getChar(pos++) & 0xFF;
- } catch (final BadLocationException ex) {
- // ignore
- }
- return -1;
+ DocumentInputStream(final IDocument doc) {
+ super(doc::getChar, doc::getLength, getCharset(doc)); // a null charset makes the superclass encode as UTF-8
}
}
diff --git a/org.eclipse.tm4e.ui/src/test/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStreamTest.java b/org.eclipse.tm4e.ui/src/test/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStreamTest.java
new file mode 100644
index 000000000..0c0d88015
--- /dev/null
+++ b/org.eclipse.tm4e.ui/src/test/java/org/eclipse/tm4e/ui/internal/utils/CharsInputStreamTest.java
@@ -0,0 +1,136 @@
+/*******************************************************************************
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Sebastian Thomschke - initial implementation
+ *******************************************************************************/
+package org.eclipse.tm4e.ui.internal.utils;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.junit.Test;
+
+/**
+ * Tests {@link CharsInputStream} with ASCII, multi-byte UTF-8 and malformed surrogate input.
+ */
+public class CharsInputStreamTest {
+    private static final String TEST_ASCII = "Hello, World!";
+
+    // non-ASCII test data is spelled with Unicode escapes so it does not depend on the file encoding
+    /** U+1F60A: a surrogate pair in Java that takes 4 bytes in UTF-8 */
+    private static final String EMOJI = "\uD83D\uDE0A";
+    private static final int EMOJI_BYTES_LEN = EMOJI.getBytes(UTF_8).length;
+    /** five hiragana chars ("konnichiwa"), each taking 3 bytes in UTF-8 */
+    private static final String JAPANESE = "\u3053\u3093\u306B\u3061\u306F";
+    private static final String TEST_UNICODE = EMOJI + JAPANESE;
+    private static final int TEST_UNICODE_BYTES_LEN = TEST_UNICODE.getBytes(UTF_8).length;
+
+    /** {@code available()} is positive while data is pending and 0 once everything was read. */
+    @Test
+    public void testAvailable() throws IOException {
+        try (var is = new CharsInputStream(TEST_ASCII)) {
+            assertEquals(TEST_ASCII.length(), is.available());
+            final byte[] buffer = new byte[4];
+            is.read(buffer);
+            assertEquals(TEST_ASCII.length() - 4, is.available());
+            is.readAllBytes();
+            assertEquals(0, is.available());
+        }
+
+        try (var is = new CharsInputStream(TEST_UNICODE)) {
+            assertTrue(is.available() > 0);
+            is.read(new byte[10]);
+            assertTrue(is.available() > 0);
+            is.readAllBytes();
+            assertEquals(0, is.available());
+        }
+    }
+
+    /** Reading after everything was skipped signals EOF. */
+    @Test
+    public void testEndOfStream() throws IOException {
+        try (var is = new CharsInputStream(TEST_UNICODE)) {
+            is.skip(Long.MAX_VALUE);
+            assertEquals(-1, is.read());
+        }
+    }
+
+    /** Byte-wise reads reproduce the UTF-8 encoding of the input. */
+    @Test
+    public void testReadEachByte() throws IOException {
+        try (var is = new CharsInputStream(TEST_UNICODE)) {
+            final var bytesRead = new ArrayList<Byte>();
+            int b;
+            while ((b = is.read()) != -1) {
+                bytesRead.add((byte) b);
+            }
+
+            final byte[] byteArray = new byte[bytesRead.size()];
+            for (int i = 0; i < bytesRead.size(); i++) {
+                byteArray[i] = bytesRead.get(i);
+            }
+            assertEquals(TEST_UNICODE, new String(byteArray, UTF_8));
+        }
+    }
+
+    /** A bulk read into a sufficiently large array returns the complete encoded input. */
+    @Test
+    public void testReadIntoByteArray() throws IOException {
+        final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text
+
+        try (var is = new CharsInputStream(TEST_UNICODE)) {
+            final int bytesRead = is.read(buffer, 0, buffer.length);
+
+            assertEquals(TEST_UNICODE_BYTES_LEN, bytesRead);
+            assertEquals(TEST_UNICODE, new String(buffer, 0, bytesRead, UTF_8));
+        }
+    }
+
+    /** Skipping the bytes of the leading emoji leaves exactly the Japanese text to be read. */
+    @Test
+    public void testSkip() throws IOException {
+        try (var is = new CharsInputStream(TEST_UNICODE)) {
+            // skip emoji
+            final long skipped = is.skip(EMOJI_BYTES_LEN);
+            assertEquals(EMOJI_BYTES_LEN, skipped);
+
+            final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN];
+            final int bytesRead = is.read(japanese);
+
+            assertEquals(JAPANESE, new String(japanese, 0, bytesRead, UTF_8));
+        }
+    }
+
+    @Test
+    public void testHighSurrogateAtEndOfInput() throws IOException {
+        final char[] invalidSequence = { 'A', '\uD800' }; // valid char followed by an isolated high surrogate
+        try (var is = new CharsInputStream(new String(invalidSequence), UTF_8)) {
+            final byte[] result = is.readAllBytes();
+            final String output = new String(result, UTF_8);
+
+            // the high surrogate at the end should be replaced by the Unicode replacement char
+            assertEquals("A\uFFFD", output);
+        }
+    }
+
+    @Test
+    public void testHighSurrogateWithoutLowSurrogate() throws IOException {
+        final char[] invalidSequence = { '\uD800', 'A' }; // \uD800 is a high surrogate, followed by 'A'
+        try (var is = new CharsInputStream(new String(invalidSequence), UTF_8)) {
+            final byte[] result = is.readAllBytes();
+            final String output = new String(result, UTF_8);
+
+            // the invalid surrogate pair should be replaced by the Unicode replacement char
+            assertEquals("\uFFFD" + "A", output);
+        }
+    }
+}