From 3187a89f572fac3100168e9ec624af2e4ae615a1 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Wed, 8 Mar 2023 11:40:41 +0800 Subject: [PATCH] Do not recognize text files as audio (#23355) Close #17108 This PR uses a trick (removing the ID3 tag) to detect the content again to to see whether the content is text type. --------- Co-authored-by: delvh Co-authored-by: techknowlogick Co-authored-by: Lunny Xiao --- modules/typesniffer/typesniffer.go | 10 ++++++++++ modules/typesniffer/typesniffer_test.go | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go index e50928e8c260a..8ac84a20953de 100644 --- a/modules/typesniffer/typesniffer.go +++ b/modules/typesniffer/typesniffer.go @@ -98,6 +98,16 @@ func DetectContentType(data []byte) SniffedType { ct = SvgMimeType } + if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) { + // The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg". + // So remove the "ID3" prefix and detect again, if result is text, then it must be text content. + // This works especially because audio files contain many unprintable/invalid characters like `0x00` + ct2 := http.DetectContentType(data[3:]) + if strings.HasPrefix(ct2, "text/") { + ct = ct2 + } + } + return SniffedType{ct} } diff --git a/modules/typesniffer/typesniffer_test.go b/modules/typesniffer/typesniffer_test.go index a3b47c459863e..0574aa6e9a608 100644 --- a/modules/typesniffer/typesniffer_test.go +++ b/modules/typesniffer/typesniffer_test.go @@ -87,6 +87,10 @@ func TestIsAudio(t *testing.T) { mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl") assert.True(t, DetectContentType(mp3).IsAudio()) assert.False(t, DetectContentType([]byte("plain text")).IsAudio()) + + assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio()) + assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText()) // test ID3 tag for plain text + assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char } func TestDetectContentTypeFromReader(t *testing.T) {