Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert files to utf-8 for indexing #7814

Merged
merged 12 commits into from
Aug 15, 2019
Merged
3 changes: 2 additions & 1 deletion integrations/migration-test/migration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/models/migrations"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/setting"

"github.com/go-xorm/xorm"
Expand Down Expand Up @@ -106,7 +107,7 @@ func readSQLFromFile(version string) (string, error) {
if err != nil {
return "", err
}
return string(base.RemoveBOMIfPresent(bytes)), nil
return string(charset.RemoveBOMIfPresent(bytes)), nil
}

func restoreOldDB(t *testing.T, version string) bool {
Expand Down
8 changes: 4 additions & 4 deletions models/git_diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ import (
"strconv"
"strings"

"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/highlight"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/process"
"code.gitea.io/gitea/modules/setting"
"github.com/Unknwon/com"
"github.com/sergi/go-diff/diffmatchpatch"
"golang.org/x/net/html/charset"
stdcharset "golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

Expand Down Expand Up @@ -641,9 +641,9 @@ func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*D
buf.WriteString("\n")
}
}
charsetLabel, err := base.DetectEncoding(buf.Bytes())
charsetLabel, err := charset.DetectEncoding(buf.Bytes())
if charsetLabel != "UTF-8" && err == nil {
encoding, _ := charset.Lookup(charsetLabel)
encoding, _ := stdcharset.Lookup(charsetLabel)
if encoding != nil {
d := encoding.NewDecoder()
for _, sec := range f.Sections {
Expand Down
4 changes: 3 additions & 1 deletion models/repo_indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"strings"

"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/log"
Expand Down Expand Up @@ -207,14 +208,15 @@ func addUpdate(update fileUpdate, repo *Repository, batch rupture.FlushingBatch)
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}
indexerUpdate := indexer.RepoIndexerUpdate{
Filepath: update.Filename,
Op: indexer.RepoIndexerOpUpdate,
Data: &indexer.RepoIndexerData{
RepoID: repo.ID,
Content: string(fileContents),
Content: string(charset.ToUTF8DropErrors(fileContents)),
},
}
return indexerUpdate.AddToFlushingBatch(batch)
Expand Down
49 changes: 0 additions & 49 deletions modules/base/tool.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
package base

import (
"bytes"
"crypto/md5"
"crypto/rand"
"crypto/sha1"
Expand All @@ -26,7 +25,6 @@ import (
"strings"
"time"
"unicode"
"unicode/utf8"

"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
Expand All @@ -35,12 +33,8 @@ import (

"github.com/Unknwon/com"
"github.com/Unknwon/i18n"
"github.com/gogits/chardet"
)

// UTF8BOM is the utf-8 byte-order marker: the three bytes 0xEF 0xBB 0xBF.
// RemoveBOMIfPresent compares the start of its input against this value.
var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}

// EncodeMD5 encodes string to md5 hex value.
func EncodeMD5(str string) string {
m := md5.New()
Expand Down Expand Up @@ -68,49 +62,6 @@ func ShortSha(sha1 string) string {
return TruncateString(sha1, 10)
}

// DetectEncoding reports the charset label detected for content.
//
// Content that is already valid UTF-8 is reported as "UTF-8" without
// consulting the detector. Otherwise the chardet text detector is used. When
// the detector reports something other than "UTF-8" and
// setting.Repository.AnsiCharset is non-empty, that configured charset is
// returned in place of the detected one. An error from the detector is
// returned together with an empty label.
func DetectEncoding(content []byte) (string, error) {
	if utf8.Valid(content) {
		log.Debug("Detected encoding: utf-8 (fast)")
		return "UTF-8", nil
	}

	detector := chardet.NewTextDetector()
	sample := content
	// content is non-empty here (empty input is valid UTF-8 and returned
	// above), so the division below cannot be by zero.
	if size := len(content); size < 1024 {
		// Check if original content is valid
		if _, err := detector.DetectBest(content); err != nil {
			return "", err
		}
		// Repeat short input until it is close to 1024 bytes, presumably to
		// hand the detector a larger sample -- TODO confirm with chardet.
		repeats := 1024 / size
		sample = make([]byte, 0, repeats*size)
		for ; repeats > 0; repeats-- {
			sample = append(sample, content...)
		}
	}

	best, err := detector.DetectBest(sample)
	if err != nil {
		return "", err
	}

	if fallback := setting.Repository.AnsiCharset; best.Charset != "UTF-8" && len(fallback) > 0 {
		log.Debug("Using default AnsiCharset: %s", fallback)
		return fallback, nil
	}

	log.Debug("Detected encoding: %s", best.Charset)
	return best.Charset, nil
}

// RemoveBOMIfPresent returns content without its leading UTF-8 byte-order
// mark. Input that does not start with UTF8BOM is returned unchanged. The
// result shares its backing array with content.
func RemoveBOMIfPresent(content []byte) []byte {
	// Anything shorter than the three-byte marker cannot carry one.
	if len(content) < 3 {
		return content
	}
	if !bytes.Equal(content[:3], UTF8BOM) {
		return content
	}
	return content[3:]
}

// BasicAuthDecode decode basic auth string
func BasicAuthDecode(encoded string) (string, string, error) {
s, err := base64.StdEncoding.DecodeString(encoded)
Expand Down
36 changes: 0 additions & 36 deletions modules/base/tool_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,42 +64,6 @@ func TestShortSha(t *testing.T) {
assert.Equal(t, "veryverylo", ShortSha("veryverylong"))
}

// TestDetectEncoding exercises DetectEncoding with UTF-8, UTF-8 with BOM,
// UTF-16LE and ISO-8859 input, with the configured AnsiCharset override, and
// with a single invalid byte. The steps are order-dependent: the override is
// set part-way through and is not restored afterwards.
func TestDetectEncoding(t *testing.T) {
	expectCharset := func(input []byte, want string) {
		got, err := DetectEncoding(input)
		assert.NoError(t, err)
		assert.Equal(t, want, got)
	}

	// utf-8
	expectCharset([]byte("just some ascii"), "UTF-8")

	// utf-8-sig: "hey" (with BOM)
	expectCharset([]byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79}, "UTF-8")

	// utf-16: "hey<accented G>"
	expectCharset([]byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01}, "UTF-16LE")

	// iso-8859-1: d<accented e>cor<newline>
	latin1 := []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
	detected, err := DetectEncoding(latin1)
	assert.NoError(t, err)
	// due to a race condition in `chardet` library, it could either detect
	// "ISO-8859-1" or "ISO-8859-2" here. Technically either is correct, so
	// we accept either.
	assert.Contains(t, detected, "ISO-8859")

	// With a default ANSI charset configured, it replaces the detected label.
	setting.Repository.AnsiCharset = "placeholder"
	expectCharset(latin1, "placeholder")

	// invalid bytes
	_, err = DetectEncoding([]byte{0xfa})
	assert.Error(t, err)
}

func TestBasicAuthDecode(t *testing.T) {
_, _, err := BasicAuthDecode("?")
assert.Equal(t, "illegal base64 data at input byte 0", err.Error())
Expand Down
152 changes: 152 additions & 0 deletions modules/charset/charset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// Copyright 2014 The Gogs Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package charset

import (
"bytes"
"fmt"
"unicode/utf8"

"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"

"github.com/gogits/chardet"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

// UTF8BOM is the utf-8 byte-order marker: the three bytes 0xEF 0xBB 0xBF.
// RemoveBOMIfPresent compares the start of its input against this value.
var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}

// ToUTF8WithErr detects the encoding of content and returns it converted to a
// UTF-8 string with any leading UTF-8 BOM removed.
//
// It returns an empty string and the detection error when DetectEncoding
// fails. When the detected charset label has no known decoder, it returns the
// unconverted content together with an error. If decoding stops part-way, the
// decoded prefix followed by the raw remaining bytes is returned along with
// the decoding error.
func ToUTF8WithErr(content []byte) (string, error) {
	label, detectErr := DetectEncoding(content)
	switch {
	case detectErr != nil:
		return "", detectErr
	case label == "UTF-8":
		// Already UTF-8: only the BOM needs removing.
		return string(RemoveBOMIfPresent(content)), nil
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return string(content), fmt.Errorf("Unknown encoding: %s", label)
	}

	// Keep whatever decoded cleanly and, on failure, tack the undecoded
	// remainder of the input onto it so that no data is dropped.
	decoded, consumed, decodeErr := transform.Bytes(enc.NewDecoder(), content)
	if decodeErr != nil {
		decoded = append(decoded, content[consumed:]...)
	}

	return string(RemoveBOMIfPresent(decoded)), decodeErr
}

// ToUTF8WithFallback detects the encoding of content and converts it to UTF-8
// if possible, never reporting an error.
//
// Fallbacks:
//   - If detection fails, or the content is already UTF-8, content is returned
//     with any leading UTF-8 BOM removed.
//   - If the detected charset label has no known decoder, content is returned
//     unchanged.
//   - If decoding stops part-way, the decoded prefix followed by the raw
//     remaining bytes is returned.
//
// A leading UTF-8 BOM is removed from every converted result, including the
// partially decoded one, matching what ToUTF8WithErr does.
func ToUTF8WithFallback(content []byte) []byte {
	charsetLabel, err := DetectEncoding(content)
	if err != nil || charsetLabel == "UTF-8" {
		return RemoveBOMIfPresent(content)
	}

	encoding, _ := charset.Lookup(charsetLabel)
	if encoding == nil {
		return content
	}

	// If there is an error, we concatenate the nicely decoded part and the
	// original left over. This way we won't lose data.
	result, n, err := transform.Bytes(encoding.NewDecoder(), content)
	if err != nil {
		result = append(result, content[n:]...)
	}

	// Strip the BOM on both the success and the partial-decode path so the
	// caller never sees one regardless of how far decoding got.
	return RemoveBOMIfPresent(result)
}

// ToUTF8 returns content converted to UTF-8 by ToUTF8WithErr, discarding any
// error that the conversion reports.
func ToUTF8(content string) string {
	// The error is dropped on purpose: the caller receives whatever string
	// ToUTF8WithErr produced alongside it.
	converted, _ := ToUTF8WithErr([]byte(content))
	return converted
}

// ToUTF8DropErrors converts content to UTF-8 where possible, dropping input
// that cannot be decoded instead of reporting an error.
//
// If detection fails, or the content is already UTF-8, content is returned
// with any leading UTF-8 BOM removed. If the detected charset label has no
// known decoder, content is returned unchanged. Otherwise content is decoded
// piecewise: each time the decoder reports an error, a single space is
// emitted, the byte at which decoding stopped is skipped, and decoding
// resumes after it. Any leading UTF-8 BOM is removed from the decoded result.
func ToUTF8DropErrors(content []byte) []byte {
	label, detectErr := DetectEncoding(content)
	if detectErr != nil || label == "UTF-8" {
		return RemoveBOMIfPresent(content)
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return content
	}

	// We ignore any non-decodable parts from the file.
	// Some parts might be lost
	var decoded []byte
	decoder := enc.NewDecoder()
	for offset := 0; offset < len(content); {
		chunk, consumed, err := transform.Bytes(decoder, content[offset:])
		decoded = append(decoded, chunk...)
		if err == nil {
			break
		}
		// Stand in for the offending byte with a space and step past it.
		decoded = append(decoded, ' ')
		offset += consumed + 1
	}

	return RemoveBOMIfPresent(decoded)
}

// RemoveBOMIfPresent returns content without its leading UTF-8 byte-order
// mark. Input that does not start with UTF8BOM is returned unchanged. The
// result shares its backing array with content.
func RemoveBOMIfPresent(content []byte) []byte {
	return bytes.TrimPrefix(content, UTF8BOM)
}

// DetectEncoding reports the charset label detected for content.
//
// Content that is already valid UTF-8 is reported as "UTF-8" without
// consulting the detector. Otherwise the chardet text detector is used. When
// the detector reports something other than "UTF-8" and
// setting.Repository.AnsiCharset is non-empty, that configured charset is
// returned in place of the detected one. An error from the detector is
// returned together with an empty label.
func DetectEncoding(content []byte) (string, error) {
	if utf8.Valid(content) {
		log.Debug("Detected encoding: utf-8 (fast)")
		return "UTF-8", nil
	}

	detector := chardet.NewTextDetector()
	sample := content
	// content is non-empty here (empty input is valid UTF-8 and returned
	// above), so the division below cannot be by zero.
	if len(content) < 1024 {
		// Check if original content is valid
		if _, err := detector.DetectBest(content); err != nil {
			return "", err
		}
		// Repeat short input until it is close to 1024 bytes, presumably to
		// hand the detector a larger sample -- TODO confirm with chardet.
		sample = bytes.Repeat(content, 1024/len(content))
	}

	best, err := detector.DetectBest(sample)
	if err != nil {
		return "", err
	}

	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
	if fallback := setting.Repository.AnsiCharset; best.Charset != "UTF-8" && len(fallback) > 0 {
		log.Debug("Using default AnsiCharset: %s", fallback)
		return fallback, nil
	}

	log.Debug("Detected encoding: %s", best.Charset)
	return best.Charset, nil
}
Loading