Skip to content

Commit

Permalink
Add 'len' parameter to Match* of Dict class. This change makes the …
Browse files Browse the repository at this point in the history
…access to a C string safer.
  • Loading branch information
BYVoid committed May 9, 2020
1 parent 2337575 commit be2492c
Show file tree
Hide file tree
Showing 11 changed files with 61 additions and 40 deletions.
15 changes: 9 additions & 6 deletions src/DartsDict.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,12 @@ DartsDict::~DartsDict() { delete internal; }

size_t DartsDict::KeyMaxLength() const { return maxLength; }

Optional<const DictEntry*> DartsDict::Match(const char* word) const {
Optional<const DictEntry*> DartsDict::Match(const char* word,
size_t len) const {
Darts::DoubleArray& dict = *internal->doubleArray;
Darts::DoubleArray::result_pair_type result;

dict.exactMatchSearch(word, result);
dict.exactMatchSearch(word, result, std::min(maxLength, len));
if (result.value != -1) {
return Optional<const DictEntry*>(
lexicon->At(static_cast<size_t>(result.value)));
Expand All @@ -62,21 +63,23 @@ Optional<const DictEntry*> DartsDict::Match(const char* word) const {
}
}

Optional<const DictEntry*> DartsDict::MatchPrefix(const char* word) const {
Optional<const DictEntry*> DartsDict::MatchPrefix(const char* word,
size_t len) const {
const size_t DEFAULT_NUM_ENTRIES = 64;
Darts::DoubleArray& dict = *internal->doubleArray;
Darts::DoubleArray::value_type results[DEFAULT_NUM_ENTRIES];
Darts::DoubleArray::value_type maxMatchedResult;
size_t numMatched =
dict.commonPrefixSearch(word, results, DEFAULT_NUM_ENTRIES);
size_t numMatched = dict.commonPrefixSearch(
word, results, DEFAULT_NUM_ENTRIES, std::min(maxLength, len));
if (numMatched == 0) {
return Optional<const DictEntry*>::Null();
} else if ((numMatched > 0) && (numMatched < DEFAULT_NUM_ENTRIES)) {
maxMatchedResult = results[numMatched - 1];
} else {
Darts::DoubleArray::value_type* rematchedResults =
new Darts::DoubleArray::value_type[numMatched];
numMatched = dict.commonPrefixSearch(word, rematchedResults, numMatched);
numMatched = dict.commonPrefixSearch(word, rematchedResults, numMatched,
std::min(maxLength, len));
maxMatchedResult = rematchedResults[numMatched - 1];
delete[] rematchedResults;
}
Expand Down
5 changes: 3 additions & 2 deletions src/DartsDict.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ class OPENCC_EXPORT DartsDict : public Dict, public SerializableDict {

virtual size_t KeyMaxLength() const;

virtual Optional<const DictEntry*> Match(const char* word) const;
virtual Optional<const DictEntry*> Match(const char* word, size_t len) const;

virtual Optional<const DictEntry*> MatchPrefix(const char* word) const;
virtual Optional<const DictEntry*> MatchPrefix(const char* word,
size_t len) const;

virtual LexiconPtr GetLexicon() const;

Expand Down
6 changes: 4 additions & 2 deletions src/Dict.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@

using namespace opencc;

Optional<const DictEntry*> Dict::MatchPrefix(const char* word) const {
Optional<const DictEntry*> Dict::MatchPrefix(const char* word,
size_t len) const {
string wordTrunc = UTF8Util::TruncateUTF8(word, KeyMaxLength());
const char* wordTruncPtr = wordTrunc.c_str() + wordTrunc.length();
for (long len = static_cast<long>(wordTrunc.length()); len > 0;) {
Expand All @@ -35,7 +36,8 @@ Optional<const DictEntry*> Dict::MatchPrefix(const char* word) const {
return Optional<const DictEntry*>::Null();
}

vector<const DictEntry*> Dict::MatchAllPrefixes(const char* word) const {
vector<const DictEntry*> Dict::MatchAllPrefixes(const char* word,
size_t len) const {
vector<const DictEntry*> matchedLengths;
string wordTrunc = UTF8Util::TruncateUTF8(word, KeyMaxLength());
const char* wordTruncPtr = wordTrunc.c_str() + wordTrunc.length();
Expand Down
17 changes: 10 additions & 7 deletions src/Dict.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
* Copyright 2010-2020 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -31,41 +31,44 @@ class OPENCC_EXPORT Dict {
/**
* Matches a word exactly and returns the DictEntry or Optional::Null().
*/
virtual Optional<const DictEntry*> Match(const char* word) const = 0;
virtual Optional<const DictEntry*> Match(const char* word,
size_t len) const = 0;

/**
* Matches a word exactly and returns the DictEntry or Optional::Null().
*/
Optional<const DictEntry*> Match(const string& word) const {
return Match(word.c_str());
return Match(word.c_str(), word.length());
}

/**
* Matches the longest matched prefix of a word.
* For example given a dictionary having "a", "an", "b", "ba", "ban", "bana",
* the longest prefix of "banana" matched is "bana".
*/
virtual Optional<const DictEntry*> MatchPrefix(const char* word) const;
virtual Optional<const DictEntry*> MatchPrefix(const char* word,
size_t len) const;

/**
* Matches the longest matched prefix of a word.
*/
Optional<const DictEntry*> MatchPrefix(const string& word) const {
return MatchPrefix(word.c_str());
return MatchPrefix(word.c_str(), word.length());
}

/**
* Returns all matched prefixes of a word, sorted by the length (desc).
* For example given a dictionary having "a", "an", "b", "ba", "ban", "bana",
* all the matched prefixes of "banana" are "bana", "ban", "ba", "b".
*/
virtual vector<const DictEntry*> MatchAllPrefixes(const char* word) const;
virtual vector<const DictEntry*> MatchAllPrefixes(const char* word,
size_t len) const;

/**
* Returns all matched prefixes of a word, sorted by the length (desc).
*/
vector<const DictEntry*> MatchAllPrefixes(const string& word) const {
return MatchAllPrefixes(word.c_str());
return MatchAllPrefixes(word.c_str(), word.length());
}

/**
Expand Down
15 changes: 9 additions & 6 deletions src/DictGroup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,34 @@ DictGroup::~DictGroup() {}

size_t DictGroup::KeyMaxLength() const { return keyMaxLength; }

Optional<const DictEntry*> DictGroup::Match(const char* word) const {
Optional<const DictEntry*> DictGroup::Match(const char* word,
size_t len) const {
for (const auto& dict : dicts) {
const Optional<const DictEntry*>& prefix = dict->Match(word);
const Optional<const DictEntry*>& prefix = dict->Match(word, len);
if (!prefix.IsNull()) {
return prefix;
}
}
return Optional<const DictEntry*>::Null();
}

Optional<const DictEntry*> DictGroup::MatchPrefix(const char* word) const {
Optional<const DictEntry*> DictGroup::MatchPrefix(const char* word,
size_t len) const {
for (const auto& dict : dicts) {
const Optional<const DictEntry*>& prefix = dict->MatchPrefix(word);
const Optional<const DictEntry*>& prefix = dict->MatchPrefix(word, len);
if (!prefix.IsNull()) {
return prefix;
}
}
return Optional<const DictEntry*>::Null();
}

vector<const DictEntry*> DictGroup::MatchAllPrefixes(const char* word) const {
vector<const DictEntry*> DictGroup::MatchAllPrefixes(const char* word,
size_t len) const {
std::map<size_t, const DictEntry*> matched;
// Match all prefixes from all dictionaries
for (const auto& dict : dicts) {
const vector<const DictEntry*>& entries = dict->MatchAllPrefixes(word);
const vector<const DictEntry*>& entries = dict->MatchAllPrefixes(word, len);
for (const auto& entry : entries) {
size_t len = entry->KeyLength();
// If the current length already has a result, skip
Expand Down
8 changes: 5 additions & 3 deletions src/DictGroup.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@ class OPENCC_EXPORT DictGroup : public Dict {

virtual size_t KeyMaxLength() const;

virtual Optional<const DictEntry*> Match(const char* word) const;
virtual Optional<const DictEntry*> Match(const char* word, size_t len) const;

virtual Optional<const DictEntry*> MatchPrefix(const char* word) const;
virtual Optional<const DictEntry*> MatchPrefix(const char* word,
size_t len) const;

virtual vector<const DictEntry*> MatchAllPrefixes(const char* word) const;
virtual vector<const DictEntry*> MatchAllPrefixes(const char* word,
size_t len) const;

virtual LexiconPtr GetLexicon() const;

Expand Down
15 changes: 9 additions & 6 deletions src/MarisaDict.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,23 @@ MarisaDict::~MarisaDict() {}

size_t MarisaDict::KeyMaxLength() const { return maxLength; }

Optional<const DictEntry*> MarisaDict::Match(const char* word) const {
Optional<const DictEntry*> MarisaDict::Match(const char* word,
size_t len) const {
const marisa::Trie& trie = *internal->marisa;
marisa::Agent agent;
agent.set_query(word);
agent.set_query(word, std::min(maxLength, len));
if (trie.lookup(agent)) {
return Optional<const DictEntry*>(lexicon->At(agent.key().id()));
} else {
return Optional<const DictEntry*>::Null();
}
}

Optional<const DictEntry*> MarisaDict::MatchPrefix(const char* word) const {
Optional<const DictEntry*> MarisaDict::MatchPrefix(const char* word,
size_t len) const {
const marisa::Trie& trie = *internal->marisa;
marisa::Agent agent;
agent.set_query(word, maxLength);
agent.set_query(word, std::min(maxLength, len));
const DictEntry* match = nullptr;
while (trie.common_prefix_search(agent)) {
match = lexicon->At(agent.key().id());
Expand All @@ -68,10 +70,11 @@ Optional<const DictEntry*> MarisaDict::MatchPrefix(const char* word) const {
}
}

vector<const DictEntry*> MarisaDict::MatchAllPrefixes(const char* word) const {
vector<const DictEntry*> MarisaDict::MatchAllPrefixes(const char* word,
size_t len) const {
const marisa::Trie& trie = *internal->marisa;
marisa::Agent agent;
agent.set_query(word);
agent.set_query(word, std::min(maxLength, len));
vector<const DictEntry*> matches;
while (trie.common_prefix_search(agent)) {
matches.push_back(lexicon->At(agent.key().id()));
Expand Down
8 changes: 5 additions & 3 deletions src/MarisaDict.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,13 @@ class OPENCC_EXPORT MarisaDict : public Dict, public SerializableDict {

virtual size_t KeyMaxLength() const;

virtual Optional<const DictEntry*> Match(const char* word) const;
virtual Optional<const DictEntry*> Match(const char* word, size_t len) const;

virtual Optional<const DictEntry*> MatchPrefix(const char* word) const;
virtual Optional<const DictEntry*> MatchPrefix(const char* word,
size_t len) const;

virtual vector<const DictEntry*> MatchAllPrefixes(const char* word) const;
virtual vector<const DictEntry*> MatchAllPrefixes(const char* word,
size_t len) const;

virtual LexiconPtr GetLexicon() const;

Expand Down
4 changes: 3 additions & 1 deletion src/MaxMatchSegmentation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ SegmentsPtr MaxMatchSegmentation::Segment(const string& text) const {
segLength = 0;
}
};
size_t length = text.length();
for (const char* pstr = text.c_str(); *pstr != '\0';) {
const Optional<const DictEntry*>& matched = dict->MatchPrefix(pstr);
const Optional<const DictEntry*>& matched = dict->MatchPrefix(pstr, length);
size_t matchedLength;
if (matched.IsNull()) {
matchedLength = UTF8Util::NextCharLength(pstr);
Expand All @@ -43,6 +44,7 @@ SegmentsPtr MaxMatchSegmentation::Segment(const string& text) const {
segStart = pstr + matchedLength;
}
pstr += matchedLength;
length -= matchedLength;
}
clearBuffer();
return segments;
Expand Down
4 changes: 2 additions & 2 deletions src/TextDict.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
* Copyright 2010-2020 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -92,7 +92,7 @@ TextDictPtr TextDict::NewFromDict(const Dict& dict) {

size_t TextDict::KeyMaxLength() const { return maxLength; }

Optional<const DictEntry*> TextDict::Match(const char* word) const {
Optional<const DictEntry*> TextDict::Match(const char* word, size_t len) const {
std::unique_ptr<DictEntry> entry(new NoValueDictEntry(word));
const auto& found = std::lower_bound(lexicon->begin(), lexicon->end(), entry,
DictEntry::UPtrLessThan);
Expand Down
4 changes: 2 additions & 2 deletions src/TextDict.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/*
* Open Chinese Convert
*
* Copyright 2010-2014 BYVoid <byvoid@byvoid.com>
* Copyright 2010-2020 BYVoid <byvoid@byvoid.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,7 +38,7 @@ class OPENCC_EXPORT TextDict : public Dict, public SerializableDict {

virtual size_t KeyMaxLength() const;

virtual Optional<const DictEntry*> Match(const char* word) const;
virtual Optional<const DictEntry*> Match(const char* word, size_t len) const;

virtual LexiconPtr GetLexicon() const;

Expand Down

0 comments on commit be2492c

Please sign in to comment.