-
Notifications
You must be signed in to change notification settings - Fork 566
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: spelling correction #228
Changes from 23 commits
135dc74
8b5d273
2563cdb
54e8a6f
dabc2ec
18e1b6d
11c21aa
6d71390
0450bb7
d95bb1e
e92d2df
b5410f7
b1b1ec4
c8e421b
2d52897
4c8ab25
bf9ca4d
cc5893c
36f71bc
50b713f
7caaaf6
5cda56c
d32b43f
3b2f1b1
6c36ee0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,8 +9,11 @@ | |
#include <boost/range/adaptor/reversed.hpp> | ||
#include <rime/dict/prism.h> | ||
#include <rime/algo/syllabifier.h> | ||
#include <rime/gear/corrector.h> | ||
#include "syllabifier.h" | ||
|
||
namespace rime { | ||
using namespace corrector; | ||
|
||
using Vertex = pair<size_t, SpellingType>; | ||
using VertexQueue = std::priority_queue<Vertex, | ||
|
@@ -35,16 +38,36 @@ int Syllabifier::BuildSyllableGraph(const string &input, | |
// record a visit to the vertex | ||
if (graph->vertices.find(current_pos) == graph->vertices.end()) | ||
graph->vertices.insert(vertex); // preferred spelling type comes first | ||
else | ||
else { | ||
// graph->vertices[current_pos] = std::min(vertex.second, graph->vertices[current_pos]); | ||
continue; // discard worse spelling types | ||
} | ||
|
||
if (current_pos > farthest) | ||
farthest = current_pos; | ||
DLOG(INFO) << "current_pos: " << current_pos; | ||
|
||
// see where we can go by advancing a syllable | ||
vector<Prism::Match> matches; | ||
prism.CommonPrefixSearch(input.substr(current_pos), &matches); | ||
set<SyllableId> match_set; | ||
auto current_input = input.substr(current_pos); | ||
prism.CommonPrefixSearch(current_input, &matches); | ||
for (auto &m : matches) { | ||
match_set.insert(m.value); | ||
} | ||
if (enable_correction_) { | ||
Corrections corrections; | ||
corrector_->ToleranceSearch(prism, current_input, &corrections, 5); | ||
for (const auto &m : corrections) { | ||
for (auto accessor = prism.QuerySpelling(m.first); !accessor.exhausted(); accessor.Next()) { | ||
if (accessor.properties().type == kNormalSpelling) { | ||
matches.push_back({ m.first, m.second.length }); | ||
break; | ||
} | ||
} | ||
} | ||
} | ||
|
||
if (!matches.empty()) { | ||
auto& end_vertices(graph->edges[current_pos]); | ||
for (const auto& m : matches) { | ||
|
@@ -56,15 +79,15 @@ int Syllabifier::BuildSyllableGraph(const string &input, | |
++end_pos; | ||
DLOG(INFO) << "end_pos: " << end_pos; | ||
bool matches_input = (current_pos == 0 && end_pos == input.length()); | ||
SpellingMap spellings; | ||
SpellingMap& spellings(end_vertices[end_pos]); | ||
SpellingType end_vertex_type = kInvalidSpelling; | ||
// when spelling algebra is enabled, | ||
// a spelling evaluates to a set of syllables; | ||
// otherwise, it resembles exactly the syllable itself. | ||
SpellingAccessor accessor(prism.QuerySpelling(m.value)); | ||
while (!accessor.exhausted()) { | ||
SyllableId syllable_id = accessor.syllable_id(); | ||
SpellingProperties props = accessor.properties(); | ||
EdgeProperties props(accessor.properties()); | ||
if (strict_spelling_ && | ||
matches_input && | ||
props.type != kNormalSpelling) { | ||
|
@@ -74,20 +97,29 @@ int Syllabifier::BuildSyllableGraph(const string &input, | |
props.end_pos = end_pos; | ||
// add a syllable with properties to the edge's | ||
// spelling-to-syllable map | ||
spellings.insert({syllable_id, props}); | ||
if (match_set.find(m.value) == match_set.end()) { | ||
props.is_correction = true; | ||
props.credibility = 0.01; | ||
lotem marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
auto it = spellings.find(syllable_id); | ||
if (it == spellings.end()) { | ||
spellings.insert({syllable_id, props}); | ||
} else { | ||
it->second.type = std::min(it->second.type, props.type); | ||
} | ||
// let end_vertex_type be the best (smaller) type of spelling | ||
// that ends at the vertex | ||
if (end_vertex_type > props.type) { | ||
if (end_vertex_type > props.type && !props.is_correction) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What if a position can only be reached via correction?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It should be kept as normal spelling.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh I see. The end vertex of a correction edge should be marked as normal spelling even if there is a worse-typed edge overlapping it.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
end_vertex_type = props.type; | ||
} | ||
} | ||
accessor.Next(); | ||
} | ||
if (spellings.empty()) { | ||
DLOG(INFO) << "not spelt."; | ||
end_vertices.erase(end_pos); | ||
continue; | ||
} | ||
end_vertices[end_pos].swap(spellings); | ||
// find the best common type in a path up to the end vertex | ||
// eg. pinyin "shurfa" has vertex type kNormalSpelling at position 3, | ||
// kAbbreviation at position 4 and kAbbreviation at position 6 | ||
|
@@ -121,6 +153,10 @@ int Syllabifier::BuildSyllableGraph(const string &input, | |
// when there is a path of more favored type | ||
SpellingType edge_type = kInvalidSpelling; | ||
for (auto k = j->second.begin(); k != j->second.end(); ) { | ||
if (k->second.is_correction) { | ||
++k; | ||
continue; // Don't care correction edges | ||
lotem marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
if (k->second.type > last_type) { | ||
j->second.erase(k++); | ||
} | ||
|
@@ -245,4 +281,9 @@ void Syllabifier::Transpose(SyllableGraph* graph) { | |
} | ||
} | ||
|
||
void Syllabifier::EnableCorrection(an<Corrector> corrector) { | ||
enable_correction_ = true; | ||
corrector_ = std::move(corrector); | ||
} | ||
|
||
} // namespace rime |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,15 +15,22 @@ | |
namespace rime { | ||
|
||
class Prism; | ||
class Corrector; | ||
|
||
using SyllableId = int32_t; | ||
|
||
using SpellingMap = map<SyllableId, SpellingProperties>; | ||
struct EdgeProperties : SpellingProperties { | ||
lotem marked this conversation as resolved.
Show resolved
Hide resolved
|
||
EdgeProperties(SpellingProperties sup): SpellingProperties(sup) {}; | ||
EdgeProperties() = default; | ||
bool is_correction = false; | ||
}; | ||
|
||
using SpellingMap = map<SyllableId, EdgeProperties>; | ||
using VertexMap = map<size_t, SpellingType>; | ||
using EndVertexMap = map<size_t, SpellingMap>; | ||
using EdgeMap = map<size_t, EndVertexMap>; | ||
|
||
using SpellingPropertiesList = vector<const SpellingProperties*>; | ||
using SpellingPropertiesList = vector<const EdgeProperties*>; | ||
using SpellingIndex = map<SyllableId, SpellingPropertiesList>; | ||
using SpellingIndices = map<size_t, SpellingIndex>; | ||
|
||
|
@@ -49,6 +56,7 @@ class Syllabifier { | |
RIME_API int BuildSyllableGraph(const string &input, | ||
Prism &prism, | ||
SyllableGraph *graph); | ||
RIME_API void EnableCorrection(an<Corrector> corrector); | ||
|
||
protected: | ||
void CheckOverlappedSpellings(SyllableGraph *graph, | ||
|
@@ -58,6 +66,8 @@ class Syllabifier { | |
string delimiters_; | ||
bool enable_completion_ = false; | ||
bool strict_spelling_ = false; | ||
an<Corrector> corrector_ = nullptr; | ||
bool enable_correction_ = false; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: this can be expressed by |
||
}; | ||
|
||
} // namespace rime | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
#include <rime/resource.h> | ||
#include <rime/service.h> | ||
#include <rime/algo/algebra.h> | ||
#include <rime/gear/corrector.h> | ||
#include <rime/algo/utilities.h> | ||
#include <rime/dict/dictionary.h> | ||
#include <rime/dict/dict_compiler.h> | ||
|
@@ -212,7 +213,7 @@ bool DictCompiler::BuildPrism(const string &schema_file, | |
Syllabary syllabary; | ||
if (!table_->Load() || !table_->GetSyllabary(&syllabary) || syllabary.empty()) | ||
return false; | ||
// apply spelling algebra | ||
// apply spelling algebra and prepare corrections (if enabled) | ||
Script script; | ||
if (!schema_file.empty()) { | ||
Config config; | ||
|
@@ -230,6 +231,26 @@ bool DictCompiler::BuildPrism(const string &schema_file, | |
script.clear(); | ||
} | ||
} | ||
|
||
#if 0 | ||
// build corrector | ||
bool enable_correction = false; // Avoid if initializer to comfort compilers | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: be consistent about capitalizing phrases/sentences in code comments. The previous line is not capitalized. |
||
if (config.GetBool("translator/enable_correction", &enable_correction) && | ||
enable_correction) { | ||
boost::filesystem::path corrector_path(prism_->file_name()); | ||
corrector_path.replace_extension(""); | ||
corrector_path.replace_extension(".correction.bin"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
correction_ = New<EditDistanceCorrector>(RelocateToUserDirectory(prefix_, corrector_path.string())); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: name it |
||
if (correction_->Exists()) { | ||
correction_->Remove(); | ||
} | ||
if (!correction_->Build(syllabary, &script, | ||
dict_file_checksum, schema_file_checksum) || | ||
!correction_->Save()) { | ||
return false; | ||
} | ||
} | ||
#endif | ||
} | ||
if ((options_ & kDump) && !script.empty()) { | ||
boost::filesystem::path path(prism_->file_name()); | ||
|
@@ -239,12 +260,13 @@ bool DictCompiler::BuildPrism(const string &schema_file, | |
// build .prism.bin | ||
{ | ||
prism_->Remove(); | ||
if (!prism_->Build(syllabary, script.empty() ? NULL : &script, | ||
if (!prism_->Build(syllabary, script.empty() ? nullptr : &script, | ||
dict_file_checksum, schema_file_checksum) || | ||
!prism_->Save()) { | ||
return false; | ||
} | ||
} | ||
|
||
return true; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
#include <rime/ticket.h> | ||
#include <rime/dict/dictionary.h> | ||
#include <rime/algo/syllabifier.h> | ||
#include <rime/gear/corrector.h> | ||
|
||
namespace rime { | ||
|
||
|
@@ -147,9 +148,9 @@ bool DictEntryIterator::Skip(size_t num_entries) { | |
// Dictionary members | ||
|
||
Dictionary::Dictionary(const string& name, | ||
const an<Table>& table, | ||
const an<Prism>& prism) | ||
: name_(name), table_(table), prism_(prism) { | ||
an<Table> table, | ||
an<Prism> prism) | ||
: name_(name), table_(std::move(table)), prism_(std::move(prism)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't understand how move semantics work here.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This may be done automatically by clang-tidy. |
||
} | ||
|
||
Dictionary::~Dictionary() { | ||
|
@@ -292,23 +293,22 @@ DictionaryComponent::DictionaryComponent() | |
: prism_resource_resolver_( | ||
Service::instance().CreateResourceResolver(kPrismResourceType)), | ||
table_resource_resolver_( | ||
Service::instance().CreateResourceResolver(kTableResourceType)) { | ||
} | ||
Service::instance().CreateResourceResolver(kTableResourceType)) {} | ||
|
||
DictionaryComponent::~DictionaryComponent() { | ||
} | ||
|
||
Dictionary* DictionaryComponent::Create(const Ticket& ticket) { | ||
if (!ticket.schema) return NULL; | ||
if (!ticket.schema) return nullptr; | ||
Config* config = ticket.schema->config(); | ||
string dict_name; | ||
if (!config->GetString(ticket.name_space + "/dictionary", &dict_name)) { | ||
LOG(ERROR) << ticket.name_space << "/dictionary not specified in schema '" | ||
<< ticket.schema->schema_id() << "'."; | ||
return NULL; | ||
return nullptr; | ||
} | ||
if (dict_name.empty()) { | ||
return NULL; // not requiring static dictionary | ||
return nullptr; // not requiring static dictionary | ||
} | ||
string prism_name; | ||
if (!config->GetString(ticket.name_space + "/prism", &prism_name)) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dependency graph of possible split modules: `gear` depends on `algo`, so what happens here is a circular dependency. Is it easy to move `corrector` to `algo`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was moved to `gear` because it is registered as a component in `gears_module.cc` and there is no `algo_module.cc`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about moving `corrector` to `src/rime/dict`, and registering the component in `dict_module.cc`?