Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: spelling correction #228

Merged
merged 25 commits into from
Dec 14, 2018
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 48 additions & 7 deletions src/rime/algo/syllabifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
#include <boost/range/adaptor/reversed.hpp>
#include <rime/dict/prism.h>
#include <rime/algo/syllabifier.h>
#include <rime/gear/corrector.h>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dependency graph of possible split modules:

core <-- {algo, dict} <-- {gear, lever}

gear depends on algo, what happens here is a circular dependency.
Is it easy to move corrector to algo?

Copy link
Member Author

@nameoverflow nameoverflow Nov 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was moved to gear because it is registered as a component in gears_module.cc and there is not a algo_module.cc

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about moving corrector to src/rime/dict, and register the component in dict_module.cc ?

#include "syllabifier.h"

namespace rime {
using namespace corrector;

using Vertex = pair<size_t, SpellingType>;
using VertexQueue = std::priority_queue<Vertex,
Expand All @@ -35,16 +38,36 @@ int Syllabifier::BuildSyllableGraph(const string &input,
// record a visit to the vertex
if (graph->vertices.find(current_pos) == graph->vertices.end())
graph->vertices.insert(vertex); // preferred spelling type comes first
else
else {
// graph->vertices[current_pos] = std::min(vertex.second, graph->vertices[current_pos]);
continue; // discard worse spelling types
}

if (current_pos > farthest)
farthest = current_pos;
DLOG(INFO) << "current_pos: " << current_pos;

// see where we can go by advancing a syllable
vector<Prism::Match> matches;
prism.CommonPrefixSearch(input.substr(current_pos), &matches);
set<SyllableId> match_set;
auto current_input = input.substr(current_pos);
prism.CommonPrefixSearch(current_input, &matches);
for (auto &m : matches) {
match_set.insert(m.value);
}
if (enable_correction_) {
Corrections corrections;
corrector_->ToleranceSearch(prism, current_input, &corrections, 5);
for (const auto &m : corrections) {
for (auto accessor = prism.QuerySpelling(m.first); !accessor.exhausted(); accessor.Next()) {
if (accessor.properties().type == kNormalSpelling) {
matches.push_back({ m.first, m.second.length });
break;
}
}
}
}

if (!matches.empty()) {
auto& end_vertices(graph->edges[current_pos]);
for (const auto& m : matches) {
Expand All @@ -56,15 +79,15 @@ int Syllabifier::BuildSyllableGraph(const string &input,
++end_pos;
DLOG(INFO) << "end_pos: " << end_pos;
bool matches_input = (current_pos == 0 && end_pos == input.length());
SpellingMap spellings;
SpellingMap& spellings(end_vertices[end_pos]);
SpellingType end_vertex_type = kInvalidSpelling;
// when spelling algebra is enabled,
// a spelling evaluates to a set of syllables;
// otherwise, it resembles exactly the syllable itself.
SpellingAccessor accessor(prism.QuerySpelling(m.value));
while (!accessor.exhausted()) {
SyllableId syllable_id = accessor.syllable_id();
SpellingProperties props = accessor.properties();
EdgeProperties props(accessor.properties());
if (strict_spelling_ &&
matches_input &&
props.type != kNormalSpelling) {
Expand All @@ -74,20 +97,29 @@ int Syllabifier::BuildSyllableGraph(const string &input,
props.end_pos = end_pos;
// add a syllable with properties to the edge's
// spelling-to-syllable map
spellings.insert({syllable_id, props});
if (match_set.find(m.value) == match_set.end()) {
props.is_correction = true;
props.credibility = 0.01;
lotem marked this conversation as resolved.
Show resolved Hide resolved
}
auto it = spellings.find(syllable_id);
if (it == spellings.end()) {
spellings.insert({syllable_id, props});
} else {
it->second.type = std::min(it->second.type, props.type);
}
// let end_vertex_type be the best (smaller) type of spelling
// that ends at the vertex
if (end_vertex_type > props.type) {
if (end_vertex_type > props.type && !props.is_correction) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if a position can only be reached via correction?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be kept as normal spelling.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh I see. End vertex of a correction edge should be marked as normal spelling even there is a worse typed edge overlapped.

Copy link
Member Author

@nameoverflow nameoverflow Nov 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And the worse overlapped edge should be deleted?
OK. It should keep the behavior as there isn't a correction.

end_vertex_type = props.type;
}
}
accessor.Next();
}
if (spellings.empty()) {
DLOG(INFO) << "not spelt.";
end_vertices.erase(end_pos);
continue;
}
end_vertices[end_pos].swap(spellings);
// find the best common type in a path up to the end vertex
// eg. pinyin "shurfa" has vertex type kNormalSpelling at position 3,
// kAbbreviation at position 4 and kAbbreviation at position 6
Expand Down Expand Up @@ -121,6 +153,10 @@ int Syllabifier::BuildSyllableGraph(const string &input,
// when there is a path of more favored type
SpellingType edge_type = kInvalidSpelling;
for (auto k = j->second.begin(); k != j->second.end(); ) {
if (k->second.is_correction) {
++k;
continue; // Don't care correction edges
lotem marked this conversation as resolved.
Show resolved Hide resolved
}
if (k->second.type > last_type) {
j->second.erase(k++);
}
Expand Down Expand Up @@ -245,4 +281,9 @@ void Syllabifier::Transpose(SyllableGraph* graph) {
}
}

void Syllabifier::EnableCorrection(an<Corrector> corrector) {
enable_correction_ = true;
corrector_ = std::move(corrector);
}

} // namespace rime
14 changes: 12 additions & 2 deletions src/rime/algo/syllabifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,22 @@
namespace rime {

class Prism;
class Corrector;

using SyllableId = int32_t;

using SpellingMap = map<SyllableId, SpellingProperties>;
struct EdgeProperties : SpellingProperties {
lotem marked this conversation as resolved.
Show resolved Hide resolved
EdgeProperties(SpellingProperties sup): SpellingProperties(sup) {};
EdgeProperties() = default;
bool is_correction = false;
};

using SpellingMap = map<SyllableId, EdgeProperties>;
using VertexMap = map<size_t, SpellingType>;
using EndVertexMap = map<size_t, SpellingMap>;
using EdgeMap = map<size_t, EndVertexMap>;

using SpellingPropertiesList = vector<const SpellingProperties*>;
using SpellingPropertiesList = vector<const EdgeProperties*>;
using SpellingIndex = map<SyllableId, SpellingPropertiesList>;
using SpellingIndices = map<size_t, SpellingIndex>;

Expand All @@ -49,6 +56,7 @@ class Syllabifier {
RIME_API int BuildSyllableGraph(const string &input,
Prism &prism,
SyllableGraph *graph);
RIME_API void EnableCorrection(an<Corrector> corrector);

protected:
void CheckOverlappedSpellings(SyllableGraph *graph,
Expand All @@ -58,6 +66,8 @@ class Syllabifier {
string delimiters_;
bool enable_completion_ = false;
bool strict_spelling_ = false;
an<Corrector> corrector_ = nullptr;
bool enable_correction_ = false;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this can be expressed by bool(corrector_)

};

} // namespace rime
Expand Down
2 changes: 2 additions & 0 deletions src/rime/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include <boost/optional.hpp>
#define BOOST_BIND_NO_PLACEHOLDERS
#ifdef BOOST_SIGNALS2
#include <boost/signals2/connection.hpp>
Expand Down Expand Up @@ -47,6 +48,7 @@ using std::pair;
using std::set;
using std::string;
using std::vector;
using boost::optional;

template <class Key, class T>
using hash_map = std::unordered_map<Key, T>;
Expand Down
26 changes: 24 additions & 2 deletions src/rime/dict/dict_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <rime/resource.h>
#include <rime/service.h>
#include <rime/algo/algebra.h>
#include <rime/gear/corrector.h>
#include <rime/algo/utilities.h>
#include <rime/dict/dictionary.h>
#include <rime/dict/dict_compiler.h>
Expand Down Expand Up @@ -212,7 +213,7 @@ bool DictCompiler::BuildPrism(const string &schema_file,
Syllabary syllabary;
if (!table_->Load() || !table_->GetSyllabary(&syllabary) || syllabary.empty())
return false;
// apply spelling algebra
// apply spelling algebra and prepare corrections (if enabled)
Script script;
if (!schema_file.empty()) {
Config config;
Expand All @@ -230,6 +231,26 @@ bool DictCompiler::BuildPrism(const string &schema_file,
script.clear();
}
}

#if 0
// build corrector
bool enable_correction = false; // Avoid if initializer to comfort compilers
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: be consistent about capitalizing phrase/sentence in code comments. the previous line is not capitalized.

if (config.GetBool("translator/enable_correction", &enable_correction) &&
enable_correction) {
boost::filesystem::path corrector_path(prism_->file_name());
corrector_path.replace_extension("");
corrector_path.replace_extension(".correction.bin");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

".correct.bin" which is shorter and matches ".reverse.bin". Yes, "correct" is an adjective.

correction_ = New<EditDistanceCorrector>(RelocateToUserDirectory(prefix_, corrector_path.string()));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: name it correction_dict or corrector (this is okay because the variable table can refer to either table builder or table reader in different circumstances)

if (correction_->Exists()) {
correction_->Remove();
}
if (!correction_->Build(syllabary, &script,
dict_file_checksum, schema_file_checksum) ||
!correction_->Save()) {
return false;
}
}
#endif
}
if ((options_ & kDump) && !script.empty()) {
boost::filesystem::path path(prism_->file_name());
Expand All @@ -239,12 +260,13 @@ bool DictCompiler::BuildPrism(const string &schema_file,
// build .prism.bin
{
prism_->Remove();
if (!prism_->Build(syllabary, script.empty() ? NULL : &script,
if (!prism_->Build(syllabary, script.empty() ? nullptr : &script,
dict_file_checksum, schema_file_checksum) ||
!prism_->Save()) {
return false;
}
}

return true;
}

Expand Down
2 changes: 2 additions & 0 deletions src/rime/dict/dict_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class Prism;
class Table;
class ReverseDb;
class DictSettings;
class EditDistanceCorrector;

class DictCompiler {
public:
Expand All @@ -43,6 +44,7 @@ class DictCompiler {

string dict_name_;
an<Prism> prism_;
an<EditDistanceCorrector> correction_;
an<Table> table_;
int options_ = 0;
string prefix_;
Expand Down
16 changes: 8 additions & 8 deletions src/rime/dict/dictionary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <rime/ticket.h>
#include <rime/dict/dictionary.h>
#include <rime/algo/syllabifier.h>
#include <rime/gear/corrector.h>

namespace rime {

Expand Down Expand Up @@ -147,9 +148,9 @@ bool DictEntryIterator::Skip(size_t num_entries) {
// Dictionary members

Dictionary::Dictionary(const string& name,
const an<Table>& table,
const an<Prism>& prism)
: name_(name), table_(table), prism_(prism) {
an<Table> table,
an<Prism> prism)
: name_(name), table_(std::move(table)), prism_(std::move(prism)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand how move semantics works here.
reminder: an<T> is short for shared_ptr<T>

Copy link
Member Author

@nameoverflow nameoverflow Nov 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be done automatically by clang-tidy.
However my personal preference is to pass by value and move to initializer.

}

Dictionary::~Dictionary() {
Expand Down Expand Up @@ -292,23 +293,22 @@ DictionaryComponent::DictionaryComponent()
: prism_resource_resolver_(
Service::instance().CreateResourceResolver(kPrismResourceType)),
table_resource_resolver_(
Service::instance().CreateResourceResolver(kTableResourceType)) {
}
Service::instance().CreateResourceResolver(kTableResourceType)) {}

DictionaryComponent::~DictionaryComponent() {
}

Dictionary* DictionaryComponent::Create(const Ticket& ticket) {
if (!ticket.schema) return NULL;
if (!ticket.schema) return nullptr;
Config* config = ticket.schema->config();
string dict_name;
if (!config->GetString(ticket.name_space + "/dictionary", &dict_name)) {
LOG(ERROR) << ticket.name_space << "/dictionary not specified in schema '"
<< ticket.schema->schema_id() << "'.";
return NULL;
return nullptr;
}
if (dict_name.empty()) {
return NULL; // not requiring static dictionary
return nullptr; // not requiring static dictionary
}
string prism_name;
if (!config->GetString(ticket.name_space + "/prism", &prism_name)) {
Expand Down
9 changes: 5 additions & 4 deletions src/rime/dict/dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,15 @@ struct DictEntryCollector : map<size_t, DictEntryIterator> {

class Config;
class Schema;
class EditDistanceCorrector;
struct SyllableGraph;
struct Ticket;

class Dictionary : public Class<Dictionary, const Ticket&> {
public:
RIME_API Dictionary(const string& name,
const an<Table>& table,
const an<Prism>& prism);
an<Table> table,
an<Prism> prism);
virtual ~Dictionary();

bool Exists() const;
Expand Down Expand Up @@ -113,8 +114,8 @@ class ResourceResolver;
class DictionaryComponent : public Dictionary::Component {
public:
DictionaryComponent();
~DictionaryComponent();
Dictionary* Create(const Ticket& ticket);
~DictionaryComponent() override;
Dictionary* Create(const Ticket& ticket) override;
Dictionary* CreateDictionaryWithName(const string& dict_name,
const string& prism_name);

Expand Down
3 changes: 1 addition & 2 deletions src/rime/dict/prism.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ bool Prism::Save() {
}
return ShrinkToFit();
}

bool Prism::Build(const Syllabary& syllabary,
const Script* script,
uint32_t dict_file_checksum,
Expand Down Expand Up @@ -240,7 +239,7 @@ bool Prism::HasKey(const string& key) {
return value != -1;
}

bool Prism::GetValue(const string& key, int* value) {
bool Prism::GetValue(const string& key, int* value) const {
int result = trie_->exactMatchSearch<int>(key.c_str());
if (result == -1) {
return false;
Expand Down
7 changes: 4 additions & 3 deletions src/rime/dict/prism.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ class Prism : public MappedFile {
RIME_API bool Load();
RIME_API bool Save();
RIME_API bool Build(const Syllabary& syllabary,
const Script* script = NULL,
const Script* script = nullptr,
uint32_t dict_file_checksum = 0,
uint32_t schema_file_checksum = 0);

RIME_API bool HasKey(const string& key);
RIME_API bool GetValue(const string& key, int* value);
RIME_API bool GetValue(const string& key, int* value) const;
RIME_API void CommonPrefixSearch(const string& key, vector<Match>* result);
RIME_API void ExpandSearch(const string& key, vector<Match>* result, size_t limit);
SpellingAccessor QuerySpelling(SyllableId spelling_id);
Expand All @@ -86,8 +86,9 @@ class Prism : public MappedFile {

uint32_t dict_file_checksum() const;
uint32_t schema_file_checksum() const;
Darts::DoubleArray& trie() const { return *trie_; }

private:
protected:
the<Darts::DoubleArray> trie_;
prism::Metadata* metadata_ = nullptr;
prism::SpellingMap* spelling_map_ = nullptr;
Expand Down
Loading