Skip to content

Commit

Permalink
Search: Add post-modifiers ;np and ;py (#32)
Browse files Browse the repository at this point in the history
  • Loading branch information
Chaoses-Ib committed Jan 25, 2022
1 parent 0f5542c commit e0a32a7
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 56 deletions.
32 changes: 20 additions & 12 deletions Hijacker/PinyinSearchPcre.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ struct Modifier {
Modifier::Value modifiers;
char pattern_initial;

void regcomp_p2_common(Modifier::Value* modifiers_p, char* pattern) {
void regcomp_p2_common(Modifier::Value* modifiers_p, char8_t* pattern) {
if constexpr (debug) {
size_t pattern_len = strlen(pattern);
size_t pattern_len = strlen((char*)pattern);
std::wstring pattern_u16(pattern_len, L'\0');
pattern_u16.resize(MultiByteToWideChar(CP_UTF8, 0, pattern, pattern_len, pattern_u16.data(), pattern_u16.size()));
pattern_u16.resize(MultiByteToWideChar(CP_UTF8, 0, (char*)pattern, pattern_len, pattern_u16.data(), pattern_u16.size()));
DebugOStream() << L"regcomp_p2_common(" << std::hex << *modifiers_p << LR"(, ")" << pattern_u16 << LR"("))"
<< L" on " << GetCurrentThreadId() << L"\n";

Expand All @@ -70,6 +70,13 @@ void regcomp_p2_common(Modifier::Value* modifiers_p, char* pattern) {
modifiers = *modifiers_p;

if (!(modifiers & Modifier::RegEx)) {
// NoProcess post-modifier
std::u8string_view pat(pattern);
if (pat.ends_with(u8";np")) {
pattern[pat.size() - 3] = u8'\0';
return;
}

pattern_initial = pattern[0];
if (pattern_initial) {
// set regex: modifier
Expand All @@ -90,7 +97,7 @@ struct regcomp_p2_14 {
void* result20;
Modifier::Value modifiers;
uint32_t int2C;
char pattern[];
char8_t pattern[];
};
#pragma pack(pop)

Expand Down Expand Up @@ -209,7 +216,7 @@ regcomp_detour(regex_t* preg, const char* pattern, int cflags)

const_cast<char*>(pattern)[0] = pattern_initial;

preg->re_pcre = compile((const char8_t*)pattern, 0, &config.pinyin_search.flags);
preg->re_pcre = compile((const char8_t*)pattern, {}, &config.pinyin_search.flags);
preg->re_nsub = 0;
preg->re_erroffset = (size_t)-1;

Expand Down Expand Up @@ -424,10 +431,10 @@ regexec_detour(const regex_t* preg, const char* string, size_t nmatch,
if (modifiers & Modifier::RegEx) {
string -= start;
error = regexec_real(preg, string, nmatch, pmatch, eflags);
rc = preg->re_nsub;
rc = 1 + preg->re_nsub;
}
else {
rc = exec((Pattern*)preg->re_pcre, (const char8_t*)string, length, nmatch, (int*)pmatch, 0);
rc = exec((Pattern*)preg->re_pcre, (const char8_t*)string, length, nmatch, (int*)pmatch, {});
if (rc == -1)
error = REG_NOMATCH;
else
Expand All @@ -440,10 +447,6 @@ regexec_detour(const regex_t* preg, const char* string, size_t nmatch,
}

for (int i = 0; i < rc; i++) {
if (!(modifiers & Modifier::RegEx) && eflags & REG_STARTEND) {
pmatch[i].rm_so += start;
pmatch[i].rm_eo += start;
}
dout << L"{" << pmatch[i].rm_so << L"," << pmatch[i].rm_eo << L"}";
}

Expand All @@ -466,16 +469,21 @@ regexec_detour(const regex_t* preg, const char* string, size_t nmatch,
length = strlen(string);
}

int rc = exec((Pattern*)preg->re_pcre, (const char8_t*)string, length, nmatch, (int*)pmatch, 0);
int rc = exec((Pattern*)preg->re_pcre, (const char8_t*)string, length, nmatch, (int*)pmatch, {});
if (rc == -1) {
return REG_NOMATCH;
}

// Everything removes this characteristic from PCRE regexec
/*
if (eflags & REG_STARTEND) {
for (int i = 0; i < rc; i++) {
pmatch[i].rm_so += start;
pmatch[i].rm_eo += start;
}
}
*/

return 0;
}

Expand Down
106 changes: 75 additions & 31 deletions Hijacker/match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ char32_t read_char32(const char8_t* str, int* length) {
}
}

Pattern* compile(const char8_t* pattern, PatternFlag::Value flags, std::vector<pinyin::PinyinFlagValue>* pinyin_flags) {
size_t length = 1; // '\0'
Pattern* compile(const char8_t* pattern, PatternFlag flags, std::vector<pinyin::PinyinFlagValue>* pinyin_flags) {
size_t length = 0;
size_t length_u8 = 0;
{
const char8_t* p = pattern;
Expand All @@ -22,53 +22,85 @@ Pattern* compile(const char8_t* pattern, PatternFlag::Value flags, std::vector<p
length++;
length_u8 += char_len;
}

// parse post-modifiers
std::u8string_view pat(pattern, length_u8);
if (pat.ends_with(u8";py")) {
flags.pinyin = true;
length -= 3;
length_u8 -= 3;
}
}
//Pattern* pat = ib::Addr(new ib::Byte[sizeof Pattern + length * sizeof(char32_t)]);
Pattern* pat = ib::Addr(HeapAlloc(GetProcessHeap(), 0, sizeof Pattern + length * sizeof(char32_t) + length_u8 * sizeof(char8_t)));
// `pat = new()` causes crashes when using Debug CRT
Pattern* pat = ib::Addr(HeapAlloc(GetProcessHeap(), 0, sizeof Pattern + (length + 1) * sizeof(char32_t) + length_u8 * sizeof(char8_t)));

pat->flags = flags;
pat->pinyin_flags = pinyin_flags;

pat->pattern_len = length - 1;
pat->pattern_len = length;
pat->pattern_u8_len = length_u8;

pat->flags.no_lower_letter_ = true;
const char8_t* p = pattern;
int char_len;
for (size_t i = 0; i < length; i++) {
pat->pattern()[i] = read_char32(p, &char_len);
char32_t c = read_char32(p, &char_len);
pat->pattern()[i] = c;
p += char_len;

if (U'a' <= c && c <= U'z')
pat->flags.no_lower_letter_ = false;
}
pat->pattern()[length] = U'\0';

memcpy(pat->pattern_u8(), pattern, length_u8);

return pat;
}

int exec(Pattern* pattern, const char8_t* subject, int length, size_t nmatch, int pmatch[], PatternFlag::Value flags)
int exec(Pattern* pattern, const char8_t* subject, int length, size_t nmatch, int pmatch[], PatternFlag exec_flags)
{
const char8_t* subject_end = subject + length;

// plain text match
bool plain = true;
{
// no-hanzi text match
bool no_hanzi = pattern->flags.no_lower_letter_;
if (!no_hanzi) [[likely]] {
no_hanzi = true;
const char8_t* s = subject;
int char_len;
for (char32_t c = read_char32(s, &char_len); s != subject_end; c = read_char32(s += char_len, &char_len)) {
if (c >= 0x3007) {
plain = false;
if (c >= 0x3007) [[unlikely]] {
no_hanzi = false;
break;
}
}
}
if (plain) {
if (no_hanzi) {
// main performance influencing code

// default: a -> [aA嗷] -> [aA], A -> [aA]
// pinyin: a -> [嗷] -> (?!), A -> [aA]

std::u8string_view sv(subject, length);
std::u8string_view pt = pattern->pattern_u8_sv();
auto it = std::search(sv.begin(), sv.end(), pt.begin(), pt.end(),
[](char8_t c1, char8_t c2) {
return std::toupper(c1) == std::toupper(c2);
});
std::u8string_view::const_iterator it;
if (!pattern->flags.pinyin) /* default */ {
it = std::search(sv.begin(), sv.end(), pt.begin(), pt.end(),
[](char8_t c1, char8_t c2) {
return std::toupper(c1) == std::toupper(c2);
});
}
else /* pinyin */ {
if (!pattern->flags.no_lower_letter_) [[likely]]
return -1;

it = std::search(sv.begin(), sv.end(), pt.begin(), pt.end(),
[](char8_t c1, char8_t c2) {
return std::toupper(c1) == c2;
});
}

if (it == sv.end()) {
if (it == sv.end()) [[likely]] {
return -1;
} else {
if (nmatch) {
Expand All @@ -84,24 +116,36 @@ int exec(Pattern* pattern, const char8_t* subject, int length, size_t nmatch, in
// DFA?
auto char_match = [pattern](char32_t c, const char32_t* pat) -> std::vector<size_t> {
std::vector<size_t> v;
if (c == *pat)
v.push_back(1);
else {
if (c < 0x3007) {
if (U'A' <= c && c <= U'Z') {
if (*pat == c - U'A' + U'a')
v.push_back(1);
} else if (U'a' <= c && c <= U'z') {
if (*pat == c - U'a' + U'A')
v.push_back(1);
}
} else {
if (pattern->flags.pinyin) {
if (c >= 0x3007) {
for (pinyin::PinyinFlagValue flag : *pattern->pinyin_flags) {
if (size_t size = pinyin::match_pinyin(pat, c, flag))
if (size_t size = pinyin::match_pinyin(pat, c, flag)) [[unlikely]]
v.push_back(size);
}
}
}
else [[likely]] {
if (c == *pat)
v.push_back(1);
else {
if (c < 0x3007) {
if (U'A' <= c && c <= U'Z') {
if (*pat == c - U'A' + U'a')
v.push_back(1);
}
else if (U'a' <= c && c <= U'z') {
if (*pat == c - U'a' + U'A')
v.push_back(1);
}
}
else {
for (pinyin::PinyinFlagValue flag : *pattern->pinyin_flags) {
if (size_t size = pinyin::match_pinyin(pat, c, flag)) [[unlikely]]
v.push_back(size);
}
}
}
}
return v;
};
std::function<const char8_t* (const char8_t*, const char32_t*)> subject_match = [&char_match, &subject_match, subject_end](const char8_t* sub, const char32_t* pattern) -> const char8_t* {
Expand Down
15 changes: 7 additions & 8 deletions Hijacker/match.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,23 @@
#include <IbPinyinLib/Pinyin.hpp>

struct PatternFlag {
using Value = unsigned int;
using T = const Value;
//static T case_ = 1;
//static T wildcards = 2;
//static T py = 3;
bool pinyin : 1;
bool no_lower_letter_ : 1;
};

struct Pattern {
PatternFlag::Value flags;
PatternFlag flags;
std::vector<pinyin::PinyinFlagValue>* pinyin_flags;
unsigned int pattern_len;
unsigned int pattern_u8_len;
//char32_t pattern[];
//char8_t pattern_u8[];

// null-terminated
char32_t* pattern() {
return ib::Addr(this) + sizeof(Pattern);
}
// not null-terminated
char8_t* pattern_u8() {
return ib::Addr(this) + sizeof(Pattern) + (pattern_len + 1) * sizeof(char32_t);
}
Expand All @@ -31,6 +30,6 @@ struct Pattern {
}
};

Pattern* compile(const char8_t* pattern, PatternFlag::Value flags, std::vector<pinyin::PinyinFlagValue>* pinyin_flags);
Pattern* compile(const char8_t* pattern, PatternFlag flags, std::vector<pinyin::PinyinFlagValue>* pinyin_flags);

int exec(Pattern* pattern, const char8_t* subject, int length, size_t nmatch, int pmatch[], PatternFlag::Value flags);
int exec(Pattern* pattern, const char8_t* subject, int length, size_t nmatch, int pmatch[], PatternFlag flags);
13 changes: 8 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,20 @@

### PCRE 模式
* 支持简拼、全拼、带声调全拼、小鹤双拼搜索。(双拼搜索默认不开启)
* 仅支持 Everything 以下版本:
* v1.4.1.1009 x64 [安装版](https://www.voidtools.com/Everything-1.4.1.1009.x64-Setup.exe)/[便携版](https://www.voidtools.com/Everything-1.4.1.1009.x64.zip)
* 支持 Everything 以下版本:
* v1.4.1.1015 x64 [安装版](https://www.voidtools.com/Everything-1.4.1.1015.x64-Setup.exe)/[便携版](https://www.voidtools.com/Everything-1.4.1.1015.x64.zip)
* v1.4.1.1009 x64 [安装版](https://www.voidtools.com/Everything-1.4.1.1009.x64-Setup.exe)/[便携版](https://www.voidtools.com/Everything-1.4.1.1009.x64.zip)
* v1.5.0.1296a x64 [安装版](https://www.voidtools.com/Everything-1.5.0.1296a.x64-Setup.exe)/[便携版](https://www.voidtools.com/Everything-1.5.0.1296a.x64.zip)
* 后置修饰符:
* `;py`:小写字母只匹配拼音
* `;np`:禁用拼音搜索

### Edit 模式(停止维护)
* 只支持简拼搜索。
* 支持 Everything x64 安装版和便携版,不支持精简版。
* 修饰符
* py: 小写字母只匹配拼音
* nopy: 禁用拼音搜索(对所有关键字生效)
* 修饰符
* `py:` 小写字母只匹配拼音
* `nopy:` 禁用拼音搜索(对所有关键字生效)

<img src="docs/search.png" style="max-height: 500px;"/>

Expand Down

0 comments on commit e0a32a7

Please sign in to comment.