Skip to content

Commit

Permalink
fix(tabby): remove redundant empty string tokens participating in rag…
Browse files Browse the repository at this point in the history
… search from token lists (#1327)

* Remove redundant empty string tokens participating in rag search from token list.

* Add unit test and comment

* Update crates/tabby/src/services/completion/completion_prompt.rs

---------

Co-authored-by: yzn <yaozhennan@secidea.com>
Co-authored-by: Meng Zhang <meng@tabbyml.com>
  • Loading branch information
3 people authored Jan 30, 2024
1 parent 1047ef4 commit ea240ac
Showing 1 changed file with 37 additions and 1 deletion.
38 changes: 37 additions & 1 deletion crates/tabby/src/services/completion/completion_prompt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,11 @@ lazy_static! {
}

/// Splits `text` with `TOKENIZER` and returns the resulting pieces as owned `String`s.
///
/// `TOKENIZER` is defined outside this view (presumably in the `lazy_static!` block
/// that ends just above — confirm against the full file).
///
/// NOTE: this span is a rendered diff without `+`/`-` markers, so it shows both the
/// old and the new body of the function back to back.
fn tokenize_text(text: &str) -> Vec<String> {
// Pre-change body (the single line this commit deletes): no empty-string filter.
TOKENIZER.split(text).map(|x| x.to_owned()).collect()
// Post-change body (the lines this commit adds): same split, but pieces that are
// empty strings are dropped before collecting.
TOKENIZER
.split(text)
.map(|x| x.to_owned())
.filter(|x| !x.is_empty())
.collect()
}

#[cfg(test)]
Expand Down Expand Up @@ -368,4 +372,36 @@ def this_is_prefix():\n";
expected_built_prefix
);
}

/// Empty-string tokens do not take part in RAG search, so `tokenize_text` is expected
/// to leave them out; this test pins the exact token list for a small code snippet.
#[test]
fn test_tokenized_text_filter() {
    let snippet = r#"public static String getFileExtension(String fullName) {
String fileName = (new File(fullName)).getName();
int dotIndex = fileName.lastIndexOf('.');
}"#;

    // Tokens in source order, with no empty strings between them.
    let expected = [
        "public",
        "static",
        "String",
        "getFileExtension",
        "String",
        "fullName",
        "String",
        "fileName",
        "new",
        "File",
        "fullName",
        "getName",
        "int",
        "dotIndex",
        "fileName",
        "lastIndexOf",
    ];

    let actual = tokenize_text(snippet);
    assert_eq!(actual, expected);
}
}

0 comments on commit ea240ac

Please sign in to comment.