forked from go-gitea/gitea
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add warning for BIDI characters in page renders and in diffs (go-gitea#17562)
Fix go-gitea#17514. Given the comments, I've adjusted this somewhat. The number of characters detected has increased and now includes things like the use of U+0300 (combining grave accent) to make à instead of the precomposed à, as well as non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
- Loading branch information
1 parent
ee60f27
commit 21ed4fd
Showing
26 changed files
with
809 additions
and
87 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
// Copyright 2021 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package charset | ||
|
||
import ( | ||
"bytes" | ||
"fmt" | ||
"io" | ||
"strings" | ||
"unicode" | ||
"unicode/utf8" | ||
|
||
"golang.org/x/text/unicode/bidi" | ||
) | ||
|
||
// EscapeStatus records what the unicode escaper found while scanning text.
type EscapeStatus struct {
	// Escaped is set whenever at least one code point or byte sequence was
	// wrapped in escape markup.
	Escaped bool

	// HasError is set when the escaper stopped because of a read or write failure.
	HasError bool

	// HasBadRunes is set when invalid or incomplete UTF-8 was encountered.
	HasBadRunes bool

	// HasControls is set for runes in the Unicode "C" categories, other than
	// line feed, carriage return, tab and the BIDI controls.
	HasControls bool

	// HasSpaces is set for space runes other than line feed, carriage return,
	// tab and the ordinary space.
	HasSpaces bool

	// HasMarks is set for runes in the Unicode "M" (mark) categories.
	HasMarks bool

	// HasBIDI is set for bidirectional control runes.
	HasBIDI bool

	// BadBIDI is set when a line holds a BIDI control and left-to-right text
	// but no right-to-left text.
	BadBIDI bool

	// HasRTLScript is set when a right-to-left (bidi class R or AL) rune was seen.
	HasRTLScript bool

	// HasLTRScript is set when a left-to-right (bidi class L) rune was seen.
	HasLTRScript bool
}

// Or returns the field-wise logical OR of status and other: a flag is set in
// the result when it is set in either operand. Neither operand is modified.
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
	return EscapeStatus{
		Escaped:      status.Escaped || other.Escaped,
		HasError:     status.HasError || other.HasError,
		HasBadRunes:  status.HasBadRunes || other.HasBadRunes,
		HasControls:  status.HasControls || other.HasControls,
		HasSpaces:    status.HasSpaces || other.HasSpaces,
		HasMarks:     status.HasMarks || other.HasMarks,
		HasBIDI:      status.HasBIDI || other.HasBIDI,
		BadBIDI:      status.BadBIDI || other.BadBIDI,
		HasRTLScript: status.HasRTLScript || other.HasRTLScript,
		HasLTRScript: status.HasLTRScript || other.HasLTRScript,
	}
}
|
||
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string | ||
func EscapeControlString(text string) (EscapeStatus, string) { | ||
sb := &strings.Builder{} | ||
escaped, _ := EscapeControlReader(strings.NewReader(text), sb) | ||
return escaped, sb.String() | ||
} | ||
|
||
// EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte | ||
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) { | ||
buf := &bytes.Buffer{} | ||
escaped, _ := EscapeControlReader(bytes.NewReader(text), buf) | ||
return escaped, buf.Bytes() | ||
} | ||
|
||
// EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error | ||
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) { | ||
buf := make([]byte, 4096) | ||
readStart := 0 | ||
var n int | ||
var writePos int | ||
|
||
lineHasBIDI := false | ||
lineHasRTLScript := false | ||
lineHasLTRScript := false | ||
|
||
readingloop: | ||
for err == nil { | ||
n, err = text.Read(buf[readStart:]) | ||
bs := buf[:n+readStart] | ||
i := 0 | ||
|
||
for i < len(bs) { | ||
r, size := utf8.DecodeRune(bs[i:]) | ||
// Now handle the codepoints | ||
switch { | ||
case r == utf8.RuneError: | ||
if writePos < i { | ||
if _, err = output.Write(bs[writePos:i]); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
writePos = i | ||
} | ||
// runes can be at most 4 bytes - so... | ||
if len(bs)-i <= 3 { | ||
// if not request more data | ||
copy(buf, bs[i:]) | ||
readStart = n - i | ||
writePos = 0 | ||
continue readingloop | ||
} | ||
// this is a real broken rune | ||
escaped.HasBadRunes = true | ||
escaped.Escaped = true | ||
if err = writeBroken(output, bs[i:i+size]); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
writePos += size | ||
case r == '\n': | ||
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { | ||
escaped.BadBIDI = true | ||
} | ||
lineHasBIDI = false | ||
lineHasRTLScript = false | ||
lineHasLTRScript = false | ||
|
||
case r == '\r' || r == '\t' || r == ' ': | ||
// These are acceptable control characters and space characters | ||
case unicode.IsSpace(r): | ||
escaped.HasSpaces = true | ||
escaped.Escaped = true | ||
if writePos < i { | ||
if _, err = output.Write(bs[writePos:i]); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
} | ||
if err = writeEscaped(output, r); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
writePos = i + size | ||
case unicode.Is(unicode.Bidi_Control, r): | ||
escaped.Escaped = true | ||
escaped.HasBIDI = true | ||
if writePos < i { | ||
if _, err = output.Write(bs[writePos:i]); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
} | ||
lineHasBIDI = true | ||
if err = writeEscaped(output, r); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
writePos = i + size | ||
case unicode.Is(unicode.C, r): | ||
escaped.Escaped = true | ||
escaped.HasControls = true | ||
if writePos < i { | ||
if _, err = output.Write(bs[writePos:i]); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
} | ||
if err = writeEscaped(output, r); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
writePos = i + size | ||
case unicode.Is(unicode.M, r): | ||
escaped.Escaped = true | ||
escaped.HasMarks = true | ||
if writePos < i { | ||
if _, err = output.Write(bs[writePos:i]); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
} | ||
if err = writeEscaped(output, r); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
writePos = i + size | ||
default: | ||
p, _ := bidi.Lookup(bs[i : i+size]) | ||
c := p.Class() | ||
if c == bidi.R || c == bidi.AL { | ||
lineHasRTLScript = true | ||
escaped.HasRTLScript = true | ||
} else if c == bidi.L { | ||
lineHasLTRScript = true | ||
escaped.HasLTRScript = true | ||
} | ||
} | ||
i += size | ||
} | ||
if n > 0 { | ||
// we read something... | ||
// write everything unwritten | ||
if writePos < i { | ||
if _, err = output.Write(bs[writePos:i]); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
} | ||
|
||
// reset the starting positions for the next read | ||
readStart = 0 | ||
writePos = 0 | ||
} | ||
} | ||
if readStart > 0 { | ||
// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round | ||
escaped.Escaped = true | ||
escaped.HasBadRunes = true | ||
if err = writeBroken(output, buf[:readStart]); err != nil { | ||
escaped.HasError = true | ||
return | ||
} | ||
} | ||
if err == io.EOF { | ||
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { | ||
escaped.BadBIDI = true | ||
} | ||
err = nil | ||
return | ||
} | ||
escaped.HasError = true | ||
return | ||
} | ||
|
||
// writeBroken writes the bytes of an invalid UTF-8 sequence to output as
// upper-case hex inside a "broken-code-point" span, e.g. &lt;E282&gt;.
// The angle brackets around the hex digits are emitted as HTML entities so
// that they are displayed as text instead of being parsed as a tag.
// It returns the error from the underlying write, if any.
func writeBroken(output io.Writer, bs []byte) (err error) {
	_, err = fmt.Fprintf(output, `<span class="broken-code-point">&lt;%X&gt;</span>`, bs)
	return err
}
|
||
// writeEscaped writes r to output wrapped in an "escaped-code-point" span
// whose data-escaped attribute carries the code point in [U+XXXX] form.
// It returns the error from the underlying write, if any.
func writeEscaped(output io.Writer, r rune) (err error) {
	const format = `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`
	if _, err = fmt.Fprintf(output, format, r, r); err != nil {
		return err
	}
	return nil
}
Oops, something went wrong.