Skip to content

Commit

Permalink
jruby: Implement set_encoding_by_bom (#101)
Browse files Browse the repository at this point in the history
Fix GH-100

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
  • Loading branch information
headius and kou authored Sep 25, 2024
1 parent 225db02 commit cb46167
Showing 1 changed file with 96 additions and 8 deletions.
104 changes: 96 additions & 8 deletions ext/java/org/jruby/ext/stringio/StringIO.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@

import org.jcodings.Encoding;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.UTF16BEEncoding;
import org.jcodings.specific.UTF16LEEncoding;
import org.jcodings.specific.UTF32BEEncoding;
import org.jcodings.specific.UTF32LEEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.*;
import org.jruby.anno.FrameField;
import org.jruby.anno.JRubyClass;
Expand All @@ -51,8 +56,10 @@
import org.jruby.util.ByteList;
import org.jruby.util.StringSupport;
import org.jruby.util.TypeConverter;
import org.jruby.util.func.ObjectObjectIntFunction;
import org.jruby.util.io.EncodingUtils;
import org.jruby.util.io.Getline;
import org.jruby.util.io.IOEncodable;
import org.jruby.util.io.ModeFlags;
import org.jruby.util.io.OpenFile;

Expand All @@ -62,6 +69,7 @@
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

import static java.lang.Byte.toUnsignedInt;
import static org.jruby.RubyEnumerator.enumeratorize;
import static org.jruby.runtime.Visibility.PRIVATE;
import static org.jruby.util.RubyStringBuilder.str;
Expand Down Expand Up @@ -93,6 +101,10 @@ static class StringIOData {

private static final AtomicReferenceFieldUpdater<StringIOData, Object> LOCKED_UPDATER = AtomicReferenceFieldUpdater.newUpdater(StringIOData.class, Object.class, "owner");

private static final ThreadLocal<Object> VMODE_VPERM_TL = ThreadLocal.withInitial(() -> EncodingUtils.vmodeVperm(null, null));
private static final ThreadLocal<int[]> FMODE_TL = ThreadLocal.withInitial(() -> new int[]{0});
private static final int[] OFLAGS_UNUSED = new int[]{0};

public static RubyClass createStringIOClass(final Ruby runtime) {
RubyClass stringIOClass = runtime.defineClass(
"StringIO", runtime.getObject(), StringIO::new);
Expand Down Expand Up @@ -298,12 +310,22 @@ private void strioInit(ThreadContext context, int argc, IRubyObject arg0, IRubyO
Encoding encoding = null;

IRubyObject options = ArgsUtil.getOptionsArg(runtime, maybeOptions);
IOEncodable.ConvConfig ioEncodable = new IOEncodable.ConvConfig();
if (!options.isNil()) {
argc--;
IRubyObject encodingOpt = ArgsUtil.extractKeywordArg(context, "encoding", (RubyHash) options);
if (!encodingOpt.isNil()) {
encoding = EncodingUtils.toEncoding(context, encodingOpt);
}

int[] fmode = {0};
Object vmodeAndVpermP = VMODE_VPERM_TL.get();

// switch to per-use oflags if it is ever used in the future
EncodingUtils.extractModeEncoding(context, ioEncodable, vmodeAndVpermP, options, OFLAGS_UNUSED, FMODE_TL.get());

// clear shared vmodeVperm
EncodingUtils.vmode(vmodeAndVpermP, null);
EncodingUtils.vperm(vmodeAndVpermP, null);

ptr.flags = fmode[0];
encoding = ioEncodable.enc;
}

switch (argc) {
Expand All @@ -312,11 +334,11 @@ private void strioInit(ThreadContext context, int argc, IRubyObject arg0, IRubyO
final boolean trunc;
if (mode instanceof RubyFixnum) {
int flags = RubyFixnum.fix2int(mode);
ptr.flags = ModeFlags.getOpenFileFlagsFor(flags);
ptr.flags |= ModeFlags.getOpenFileFlagsFor(flags);
trunc = (flags & ModeFlags.TRUNC) != 0;
} else {
String m = arg1.convertToString().toString();
ptr.flags = OpenFile.ioModestrFmode(runtime, m);
ptr.flags |= OpenFile.ioModestrFmode(runtime, m);
trunc = m.length() > 0 && m.charAt(0) == 'w';
}
string = arg0.convertToString();
Expand All @@ -329,11 +351,11 @@ private void strioInit(ThreadContext context, int argc, IRubyObject arg0, IRubyO
break;
case 1:
string = arg0.convertToString();
ptr.flags = string.isFrozen() ? OpenFile.READABLE : OpenFile.READWRITE;
ptr.flags |= string.isFrozen() ? OpenFile.READABLE : OpenFile.READWRITE;
break;
case 0:
string = RubyString.newEmptyString(runtime, runtime.getDefaultExternalEncoding());
ptr.flags = OpenFile.READWRITE;
ptr.flags |= OpenFile.READWRITE;
break;
default:
// should not be possible
Expand All @@ -344,6 +366,7 @@ private void strioInit(ThreadContext context, int argc, IRubyObject arg0, IRubyO
ptr.enc = encoding;
ptr.pos = 0;
ptr.lineno = 0;
if ((ptr.flags & OpenFile.SETENC_BY_BOM) != 0) setEncodingByBOM(context);
// funky way of shifting readwrite flags into object flags
flags |= (ptr.flags & OpenFile.READWRITE) * (STRIO_READABLE / OpenFile.READABLE);
} finally {
Expand Down Expand Up @@ -1636,6 +1659,71 @@ public IRubyObject set_encoding(ThreadContext context, IRubyObject enc, IRubyObj
return set_encoding(context, enc);
}

/**
 * Ruby-visible entry point: looks for a byte order mark at the start of the
 * underlying string and applies the matching encoding to this StringIO.
 *
 * @param context the current thread context
 * @return the Ruby encoding object for the encoding now stored in
 *         {@code ptr.enc}, or {@code nil} when no encoding was detected
 */
@JRubyMethod
public IRubyObject set_encoding_by_bom(ThreadContext context) {
    final Encoding detected = setEncodingByBOM(context);
    if (detected == null) {
        return context.nil;
    }

    // setEncodingByBOM has already stored the detected encoding in ptr.enc
    return context.runtime.getEncodingService().convertEncodingToRubyEncoding(ptr.enc);
}

/**
 * Detects a byte order mark at the start of the underlying string and, when
 * one is present, moves the read position past it and records the matching
 * encoding.
 *
 * <p>When a BOM is found, {@code ptr.pos} is set to the BOM length and, if
 * this StringIO is writable, the underlying string is re-tagged with the
 * detected encoding. When no BOM is found, the position and the string's
 * encoding are left untouched; previously the callback was still applied
 * with a {@code null} encoding and a zero length, which reset the position
 * and passed {@code null} to {@code RubyString#setEncoding}.
 *
 * <p>In both cases {@code ptr.enc} is assigned the result, so it becomes
 * {@code null} when no BOM was detected.
 *
 * @param context the current thread context
 * @return the detected encoding, or {@code null} if the string has no BOM
 */
private Encoding setEncodingByBOM(ThreadContext context) {
    Encoding enc = detectBOM(context, ptr.string, (ctx, enc2, bomlen) -> {
        // detectBOM reports "no BOM" as (null, 0); nothing to apply in that case
        if (enc2 == null) return null;

        ptr.pos = bomlen;
        if (writable()) {
            ptr.string.setEncoding(enc2);
        }
        return enc2;
    });
    ptr.enc = enc;
    return enc;
}

/**
 * Inspects the leading bytes of {@code str} for a Unicode byte order mark and
 * reports the outcome through {@code callback}.
 *
 * <p>Recognised marks: {@code EF BB BF} (UTF-8), {@code FE FF} (UTF-16BE),
 * {@code FF FE 00 00} (UTF-32LE), {@code FF FE} (UTF-16LE) and
 * {@code 00 00 FE FF} (UTF-32BE). The UTF-32LE check runs before UTF-16LE
 * because both share the {@code FF FE} prefix.
 *
 * @param context  the current thread context, passed through to the callback
 * @param str      the string whose first bytes are examined
 * @param callback invoked with the matched encoding and BOM length, or with
 *                 {@code null} and {@code 0} when a non-empty string has no BOM
 * @return {@code null} for an empty string (the callback is not invoked);
 *         otherwise whatever the callback returns
 */
private static Encoding detectBOM(ThreadContext context, RubyString str, ObjectObjectIntFunction<ThreadContext, Encoding, Encoding> callback) {
    final ByteList contents = str.getByteList();
    final byte[] raw = contents.unsafeBytes();
    final int start = contents.begin();
    final int size = contents.realSize();

    if (size < 1) return null;

    // -1 marks a byte that lies beyond the end of the string; it never equals a real byte value
    final int b0 = toUnsignedInt(raw[start]);
    final int b1 = size >= 2 ? toUnsignedInt(raw[start + 1]) : -1;
    final int b2 = size >= 3 ? toUnsignedInt(raw[start + 2]) : -1;
    final int b3 = size >= 4 ? toUnsignedInt(raw[start + 3]) : -1;

    if (b0 == 0xEF) {
        if (b1 == 0xBB && b2 == 0xBF) {
            return callback.apply(context, UTF8Encoding.INSTANCE, 3);
        }
    } else if (b0 == 0xFE) {
        if (b1 == 0xFF) {
            return callback.apply(context, UTF16BEEncoding.INSTANCE, 2);
        }
    } else if (b0 == 0xFF) {
        if (b1 == 0xFE) {
            // FF FE 00 00 is UTF-32LE; a bare FF FE prefix is UTF-16LE
            if (b2 == 0 && b3 == 0) {
                return callback.apply(context, UTF32LEEncoding.INSTANCE, 4);
            }
            return callback.apply(context, UTF16LEEncoding.INSTANCE, 2);
        }
    } else if (b0 == 0) {
        if (b1 == 0 && b2 == 0xFE && b3 == 0xFF) {
            return callback.apply(context, UTF32BEEncoding.INSTANCE, 4);
        }
    }

    // non-empty string without a recognised BOM
    return callback.apply(context, null, 0);
}


@JRubyMethod
public IRubyObject external_encoding(ThreadContext context) {
return context.runtime.getEncodingService().convertEncodingToRubyEncoding(getEncoding());
Expand Down

0 comments on commit cb46167

Please sign in to comment.