diff --git a/CODEOWNERS b/CODEOWNERS index d0bb43ec8b8..fd381316f2e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -22,7 +22,6 @@ experimental/collator/ @hsivonen @echeran experimental/normalizer/ @hsivonen @echeran experimental/provider_ppucd/ @echeran experimental/segmenter/ @aethanyc @makotokato -experimental/segmenter_lstm/ @aethanyc @sffc ffi/capi/ @Manishearth ffi/cpp/ @Manishearth ffi/ecma402/ @filmil diff --git a/Cargo.lock b/Cargo.lock index 40dcc215f1d..d022da279d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1814,26 +1814,16 @@ dependencies = [ "icu_char16trie", "icu_codepointtrie", "icu_provider", - "icu_segmenter_lstm", "icu_testdata", "lazy_static", - "serde", - "serde-json-core", - "serde_json", - "zerovec", -] - -[[package]] -name = "icu_segmenter_lstm" -version = "0.1.0" -dependencies = [ - "icu_provider", "litemap", "ndarray", + "num-traits", "serde", + "serde-json-core", "serde_json", "unicode-segmentation", - "yoke", + "zerovec", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 977c04cb86a..d51f49fdcf5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,6 @@ members = [ "experimental/collator", "experimental/normalizer", "experimental/segmenter", - "experimental/segmenter_lstm", "ffi/capi_cdylib", "ffi/diplomat", "ffi/capi_staticlib", diff --git a/experimental/segmenter/Cargo.toml b/experimental/segmenter/Cargo.toml index b7ff885240c..cca9b0493eb 100644 --- a/experimental/segmenter/Cargo.toml +++ b/experimental/segmenter/Cargo.toml @@ -31,12 +31,15 @@ skip_optional_dependencies = true icu_char16trie = { version = "0.1", path = "../char16trie" } icu_codepointtrie = { path = "../../utils/codepointtrie" } icu_provider = { version = "0.6", path = "../../provider/core", features = ["macros"] } -icu_segmenter_lstm = { version = "0.1", path = "../segmenter_lstm", optional = true } serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["alloc"] } lazy_static = { version = "1.0", features = ["spin_no_std"] } zerovec = { version = "0.7", path = "../../utils/zerovec", features = ["yoke"] } crabbake = { version = "0.4", path = "../../experimental/crabbake", optional = true, features = ["derive"] } +litemap = { version = "0.4.0", path = "../../utils/litemap", optional = true, features = ["serde"] } +ndarray = { git = "https://github.com/rust-ndarray/ndarray", rev = "31244100631382bb8ee30721872a928bfdf07f44", default-features = false, optional = true, features = ["serde"] } +unicode-segmentation = { version = "1.3.0", optional = true } +num-traits = { version = "0.2", optional = true } [dev-dependencies] criterion = "0.3" @@ -57,6 +60,6 @@ required-features = ["lstm"] [features] default = [] -lstm = ["icu_segmenter_lstm"] +lstm = ["litemap", "ndarray", "num-traits", "serde", "unicode-segmentation"] serde = ["dep:serde", "zerovec/serde", "icu_codepointtrie/serde"] datagen = ["serde", "crabbake", "zerovec/crabbake", "icu_codepointtrie/crabbake"] diff --git a/experimental/segmenter/src/lib.rs b/experimental/segmenter/src/lib.rs index 927713c8f5c..1ab00826e2f 100644 --- a/experimental/segmenter/src/lib.rs +++ b/experimental/segmenter/src/lib.rs @@ -157,6 +157,14 @@ extern crate lazy_static; // Use the LSTM when the feature is enabled. #[cfg(feature = "lstm")] mod lstm; +#[cfg(feature = "lstm")] +mod lstm_bies; +#[cfg(feature = "lstm")] +mod lstm_error; +#[cfg(feature = "lstm")] +mod lstm_structs; +#[cfg(feature = "lstm")] +mod math_helper; pub use crate::dictionary::{DictionaryBreakIterator, DictionarySegmenter}; pub use crate::grapheme::{ @@ -167,6 +175,8 @@ pub use crate::line::{ Latin1Char, LineBreakIterator, LineBreakOptions, LineBreakRule, LineBreakSegmenter, Utf16Char, WordBreakRule, }; +#[cfg(feature = "lstm")] +pub use crate::lstm_structs::LstmDataMarker; pub use crate::provider::{ GraphemeClusterBreakDataV1Marker, LineBreakDataV1Marker, RuleBreakDataV1, RuleBreakPropertyTable, RuleBreakStateTable, SentenceBreakDataV1Marker, diff --git a/experimental/segmenter/src/lstm.rs b/experimental/segmenter/src/lstm.rs index d389e831d3a..a112af9c3eb 100644 --- a/experimental/segmenter/src/lstm.rs +++ b/experimental/segmenter/src/lstm.rs @@ -3,14 +3,14 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use crate::language::*; +use crate::lstm_bies::Lstm; +use crate::lstm_structs::{LstmData, LstmDataMarker}; use alloc::string::String; use alloc::string::ToString; use core::char::decode_utf16; use icu_provider::DataError; use icu_provider::DataPayload; -use icu_segmenter_lstm::lstm::Lstm; -use icu_segmenter_lstm::structs; // TODO: // json file is big, So I should use anoher binary format like npy. @@ -21,14 +21,14 @@ const BURMESE_MODEL: &[u8; 475209] = include_bytes!("../tests/testdata/json/core/segmenter_lstm@1/my.json"); lazy_static! { - static ref THAI_LSTM: structs::LstmData<'static> = + static ref THAI_LSTM: LstmData<'static> = serde_json::from_slice(THAI_MODEL).expect("JSON syntax error"); - static ref BURMESE_LSTM: structs::LstmData<'static> = + static ref BURMESE_LSTM: LstmData<'static> = serde_json::from_slice(BURMESE_MODEL).expect("JSON syntax error"); } // LSTM model depends on language, So we have to switch models per language. -pub fn get_best_lstm_model(codepoint: u32) -> Option> { +pub fn get_best_lstm_model(codepoint: u32) -> Option> { let lang = get_language(codepoint); match lang { Language::Thai => Some(DataPayload::from_owned(THAI_LSTM.clone())), @@ -111,7 +111,7 @@ pub struct LstmSegmenter { } impl LstmSegmenter { - pub fn try_new(payload: DataPayload) -> Result { + pub fn try_new(payload: DataPayload) -> Result { let lstm = Lstm::try_new(payload).unwrap(); Ok(Self { lstm }) diff --git a/experimental/segmenter_lstm/src/lstm.rs b/experimental/segmenter/src/lstm_bies.rs similarity index 66% rename from experimental/segmenter_lstm/src/lstm.rs rename to experimental/segmenter/src/lstm_bies.rs index fc57efe828f..39185e166aa 100644 --- a/experimental/segmenter_lstm/src/lstm.rs +++ b/experimental/segmenter/src/lstm_bies.rs @@ -2,9 +2,9 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::error::Error; +use crate::lstm_error::Error; +use crate::lstm_structs::LstmDataMarker; use crate::math_helper; -use crate::structs; use alloc::string::{String, ToString}; use alloc::vec::Vec; use core::str; @@ -13,12 +13,12 @@ use ndarray::{Array1, Array2, ArrayBase, Dim, ViewRepr}; use unicode_segmentation::UnicodeSegmentation; pub struct Lstm { - data: DataPayload, + data: DataPayload, } impl Lstm { /// `try_new` is the initiator of struct `Lstm` - pub fn try_new(data: DataPayload) -> Result { + pub fn try_new(data: DataPayload) -> Result { if data.get().dic.len() > core::i16::MAX as usize { return Err(Error::Limit); } @@ -43,6 +43,7 @@ impl Lstm { } /// `get_model_name` returns the name of the LSTM model. + #[allow(dead_code)] pub fn get_model_name(&self) -> &str { &self.data.get().model } @@ -168,3 +169,94 @@ impl Lstm { bies } } + +#[cfg(test)] +mod tests { + use super::*; + use serde::{Deserialize, Serialize}; + use std::fs::File; + use std::io::BufReader; + + /// `TestCase` is a struct used to store a single test case. + /// Each test case has two attributs: `unseg` which denots the unsegmented line, and `true_bies` which indicates the Bies + /// sequence representing the true segmentation. + #[derive(PartialEq, Debug, Serialize, Deserialize)] + pub struct TestCase { + pub unseg: String, + pub expected_bies: String, + pub true_bies: String, + } + + /// `TestTextData` is a struct to store a vector of `TestCase` that represents a test text. + #[derive(PartialEq, Debug, Serialize, Deserialize)] + pub struct TestTextData { + pub testcases: Vec, + } + + #[derive(Debug)] + pub struct TestText { + pub data: TestTextData, + } + + impl TestText { + pub fn new(data: TestTextData) -> Self { + Self { data } + } + } + + fn load_lstm_data(filename: &str) -> DataPayload { + DataPayload::::try_from_rc_buffer_badly( + std::fs::read(filename) + .expect("File can read to end") + .into(), + |bytes| serde_json::from_slice(bytes), + ) + .expect("JSON syntax error") + } + + fn load_test_text(filename: &str) -> TestTextData { + let file = File::open(filename).expect("File should be present"); + let reader = BufReader::new(file); + serde_json::from_reader(reader).expect("JSON syntax error") + } + + #[test] + fn test_model_loading() { + let filename = "tests/testdata/Thai_graphclust_exclusive_model4_heavy/weights.json"; + let lstm_data = load_lstm_data(filename); + let lstm = Lstm::try_new(lstm_data).unwrap(); + assert_eq!( + lstm.get_model_name(), + String::from("Thai_graphclust_exclusive_model4_heavy") + ); + } + + #[test] + fn segment_file_by_lstm() { + // Choosing the embedding system. It can be "graphclust" or "codepoints". + let embedding: &str = "codepoints"; + let mut model_filename = "tests/testdata/Thai_".to_owned(); + model_filename.push_str(embedding); + model_filename.push_str("_exclusive_model4_heavy/weights.json"); + let lstm_data = load_lstm_data(&model_filename); + let lstm = Lstm::try_new(lstm_data).unwrap(); + + // Importing the test data + let mut test_text_filename = "tests/testdata/test_text_".to_owned(); + test_text_filename.push_str(embedding); + test_text_filename.push_str(".json"); + let test_text_data = load_test_text(&test_text_filename); + let test_text = TestText::new(test_text_data); + + // Testing + for test_case in test_text.data.testcases { + let lstm_output = lstm.word_segmenter(&test_case.unseg); + println!("Test case : {}", test_case.unseg); + println!("Expected bies : {}", test_case.expected_bies); + println!("Estimated bies : {}", lstm_output); + println!("True bies : {}", test_case.true_bies); + println!("****************************************************"); + assert_eq!(test_case.expected_bies, lstm_output); + } + } +} diff --git a/experimental/segmenter_lstm/src/error.rs b/experimental/segmenter/src/lstm_error.rs similarity index 100% rename from experimental/segmenter_lstm/src/error.rs rename to experimental/segmenter/src/lstm_error.rs diff --git a/experimental/segmenter_lstm/src/structs.rs b/experimental/segmenter/src/lstm_structs.rs similarity index 100% rename from experimental/segmenter_lstm/src/structs.rs rename to experimental/segmenter/src/lstm_structs.rs diff --git a/experimental/segmenter_lstm/src/math_helper.rs b/experimental/segmenter/src/math_helper.rs similarity index 100% rename from experimental/segmenter_lstm/src/math_helper.rs rename to experimental/segmenter/src/math_helper.rs diff --git a/experimental/segmenter_lstm/tests/testdata/Thai_codepoints_exclusive_model4_heavy/weights.json b/experimental/segmenter/tests/testdata/Thai_codepoints_exclusive_model4_heavy/weights.json similarity index 100% rename from experimental/segmenter_lstm/tests/testdata/Thai_codepoints_exclusive_model4_heavy/weights.json rename to experimental/segmenter/tests/testdata/Thai_codepoints_exclusive_model4_heavy/weights.json diff --git a/experimental/segmenter_lstm/tests/testdata/Thai_graphclust_exclusive_model4_heavy/weights.json b/experimental/segmenter/tests/testdata/Thai_graphclust_exclusive_model4_heavy/weights.json similarity index 100% rename from experimental/segmenter_lstm/tests/testdata/Thai_graphclust_exclusive_model4_heavy/weights.json rename to experimental/segmenter/tests/testdata/Thai_graphclust_exclusive_model4_heavy/weights.json diff --git a/experimental/segmenter_lstm/tests/testdata/test_text_codepoints.json b/experimental/segmenter/tests/testdata/test_text_codepoints.json similarity index 100% rename from experimental/segmenter_lstm/tests/testdata/test_text_codepoints.json rename to experimental/segmenter/tests/testdata/test_text_codepoints.json diff --git a/experimental/segmenter_lstm/tests/testdata/test_text_graphclust.json b/experimental/segmenter/tests/testdata/test_text_graphclust.json similarity index 100% rename from experimental/segmenter_lstm/tests/testdata/test_text_graphclust.json rename to experimental/segmenter/tests/testdata/test_text_graphclust.json diff --git a/experimental/segmenter_lstm/Cargo.toml b/experimental/segmenter_lstm/Cargo.toml deleted file mode 100644 index 91ab5d8509a..00000000000 --- a/experimental/segmenter_lstm/Cargo.toml +++ /dev/null @@ -1,45 +0,0 @@ -# This file is part of ICU4X. For terms of use, please see the file -# called LICENSE at the top level of the ICU4X source tree -# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -[package] -name = "icu_segmenter_lstm" -description = "LSTM-based segmenter for dictionary-based languages" -version = "0.1.0" -authors = ["The ICU4X Project Developers"] -edition = "2018" -readme = "README.md" -repository = "https://github.com/unicode-org/icu4x" -license-file = "LICENSE" -categories = ["internationalization"] -# Keep this in sync with other crates unless there are exceptions -include = [ - "src/**/*", - "examples/**/*", - "benches/**/*", - "tests/**/*", - "Cargo.toml", - "LICENSE", - "README.md" -] - -[dependencies] -icu_provider = { version = "0.6", path = "../../provider/core", features = ["macros"] } -litemap = { version = "0.4.0", path = "../../utils/litemap", features = ["serde"] } -serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] } -serde_json = { version = "1.0", default-features = false, features = ["alloc"] } -ndarray = { git = "https://github.com/rust-ndarray/ndarray", rev = "31244100631382bb8ee30721872a928bfdf07f44", features = ["serde"] } -unicode-segmentation = "1.3.0" -yoke = { version = "0.6.0", path = "../../utils/yoke", features = ["derive"] } - -[dev-dependencies] -serde_json = "1.0" -# criterion = "0.3.3" - -[lib] -bench = false # This option is required for Benchmark CI -path = "src/lib.rs" - -[features] -default = [] -bench = [] diff --git a/experimental/segmenter_lstm/LICENSE b/experimental/segmenter_lstm/LICENSE deleted file mode 100644 index 5ab1f57507b..00000000000 --- a/experimental/segmenter_lstm/LICENSE +++ /dev/null @@ -1,331 +0,0 @@ -Except as otherwise noted below, ICU4X is licensed under the Apache -License, Version 2.0 (included below) or the MIT license (included -below), at your option. Unless importing data or code in the manner -stated below, any contribution intentionally submitted for inclusion -in ICU4X by you, as defined in the Apache-2.0 license, shall be dual -licensed in the foregoing manner, without any additional terms or -conditions. - -As exceptions to the above: -* Portions of ICU4X that have been adapted from ICU4C and/or ICU4J are -under the Unicode license (included below) and/or the ICU license -(included below) as indicated by source code comments. -* Unicode data incorporated in ICU4X is under the Unicode license -(included below). -* Your contributions may import code from ICU4C and/or ICU4J and -Unicode data under these licenses. Indicate the license and the ICU4C -or ICU4J origin in source code comments. - -- - - - - -Apache License, version 2.0 - - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -- - - - - -MIT License - -Copyright The ICU4X Authors - -Permission is hereby granted, free of charge, to any -person obtaining a copy of this software and associated -documentation files (the "Software"), to deal in the -Software without restriction, including without -limitation the rights to use, copy, modify, merge, -publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software -is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice -shall be included in all copies or substantial portions -of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF -ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT -SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. - -- - - - - -Unicode License - -COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later) - -Copyright © 1991-2020 Unicode, Inc. All rights reserved. -Distributed under the Terms of Use in https://www.unicode.org/copyright.html. - -Permission is hereby granted, free of charge, to any person obtaining -a copy of the Unicode data files and any associated documentation -(the "Data Files") or Unicode software and any associated documentation -(the "Software") to deal in the Data Files or Software -without restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, and/or sell copies of -the Data Files or Software, and to permit persons to whom the Data Files -or Software are furnished to do so, provided that either -(a) this copyright and permission notice appear with all copies -of the Data Files or Software, or -(b) this copyright and permission notice appear in associated -Documentation. - -THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF -ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT OF THIRD PARTY RIGHTS. -IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS -NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL -DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, -DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER -TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -PERFORMANCE OF THE DATA FILES OR SOFTWARE. - -Except as contained in this notice, the name of a copyright holder -shall not be used in advertising or otherwise to promote the sale, -use or other dealings in these Data Files or Software without prior -written authorization of the copyright holder. - -- - - - - -ICU License - ICU 1.8.1 to ICU 57.1 - -COPYRIGHT AND PERMISSION NOTICE - -Copyright (c) 1995-2016 International Business Machines Corporation and others -All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, and/or sell copies of the Software, and to permit persons -to whom the Software is furnished to do so, provided that the above -copyright notice(s) and this permission notice appear in all copies of -the Software and that both the above copyright notice(s) and this -permission notice appear in supporting documentation. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT -OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY -SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER -RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF -CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -Except as contained in this notice, the name of a copyright holder -shall not be used in advertising or otherwise to promote the sale, use -or other dealings in this Software without prior written authorization -of the copyright holder. - -All trademarks and registered trademarks mentioned herein are the -property of their respective owners. - -- - - - diff --git a/experimental/segmenter_lstm/README.md b/experimental/segmenter_lstm/README.md deleted file mode 100644 index e854a3badd9..00000000000 --- a/experimental/segmenter_lstm/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# icu_segmenter_lstm [![crates.io](https://img.shields.io/crates/v/icu_segmenter_lstm)](https://crates.io/crates/icu_segmenter_lstm) - - - -## More Information - -For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). diff --git a/experimental/segmenter_lstm/src/lib.rs b/experimental/segmenter_lstm/src/lib.rs deleted file mode 100644 index e0d68044b5d..00000000000 --- a/experimental/segmenter_lstm/src/lib.rs +++ /dev/null @@ -1,14 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -#![cfg_attr(not(any(test, feature = "std")), no_std)] - -extern crate alloc; - -mod error; -pub mod lstm; -pub mod math_helper; -pub mod structs; - -pub use error::Error as LstmError; diff --git a/experimental/segmenter_lstm/tests/lstm_test.rs b/experimental/segmenter_lstm/tests/lstm_test.rs deleted file mode 100644 index e72b0ce4734..00000000000 --- a/experimental/segmenter_lstm/tests/lstm_test.rs +++ /dev/null @@ -1,94 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -use icu_provider::DataPayload; -use icu_segmenter_lstm::lstm::Lstm; -use icu_segmenter_lstm::structs; - -use serde::{Deserialize, Serialize}; -use std::fs::File; -use std::io::BufReader; - -/// `TestCase` is a struct used to store a single test case. -/// Each test case has two attributs: `unseg` which denots the unsegmented line, and `true_bies` which indicates the Bies -/// sequence representing the true segmentation. -#[derive(PartialEq, Debug, Serialize, Deserialize)] -pub struct TestCase { - pub unseg: String, - pub expected_bies: String, - pub true_bies: String, -} - -/// `TestTextData` is a struct to store a vector of `TestCase` that represents a test text. -#[derive(PartialEq, Debug, Serialize, Deserialize)] -pub struct TestTextData { - pub testcases: Vec, -} - -#[derive(Debug)] -pub struct TestText { - pub data: TestTextData, -} - -impl TestText { - pub fn new(data: TestTextData) -> Self { - Self { data } - } -} - -fn load_lstm_data(filename: &str) -> DataPayload { - DataPayload::::try_from_rc_buffer_badly( - std::fs::read(filename) - .expect("File can read to end") - .into(), - |bytes| serde_json::from_slice(bytes), - ) - .expect("JSON syntax error") -} - -fn load_test_text(filename: &str) -> TestTextData { - let file = File::open(filename).expect("File should be present"); - let reader = BufReader::new(file); - serde_json::from_reader(reader).expect("JSON syntax error") -} - -#[test] -fn test_model_loading() { - let filename = "tests/testdata/Thai_graphclust_exclusive_model4_heavy/weights.json"; - let lstm_data = load_lstm_data(filename); - let lstm = Lstm::try_new(lstm_data).unwrap(); - assert_eq!( - lstm.get_model_name(), - String::from("Thai_graphclust_exclusive_model4_heavy") - ); -} - -#[test] -fn segment_file_by_lstm() { - // Choosing the embedding system. It can be "graphclust" or "codepoints". - let embedding: &str = "codepoints"; - let mut model_filename = "tests/testdata/Thai_".to_owned(); - model_filename.push_str(embedding); - model_filename.push_str("_exclusive_model4_heavy/weights.json"); - let lstm_data = load_lstm_data(&model_filename); - let lstm = Lstm::try_new(lstm_data).unwrap(); - - // Importing the test data - let mut test_text_filename = "tests/testdata/test_text_".to_owned(); - test_text_filename.push_str(embedding); - test_text_filename.push_str(".json"); - let test_text_data = load_test_text(&test_text_filename); - let test_text = TestText::new(test_text_data); - - // Testing - for test_case in test_text.data.testcases { - let lstm_output = lstm.word_segmenter(&test_case.unseg); - println!("Test case : {}", test_case.unseg); - println!("Expected bies : {}", test_case.expected_bies); - println!("Estimated bies : {}", lstm_output); - println!("True bies : {}", test_case.true_bies); - println!("****************************************************"); - assert_eq!(test_case.expected_bies, lstm_output); - } -}