This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- New `encoding()`, returns `"ASCII"` for pure ASCII strings and behaves identically to `base::Encoding()` otherwise. - New `all_utf8()`, returns an atomic logical that indicates whether all elements of a character vector are UTF-8 encoded; this includes pure ASCII strings.
- Loading branch information
Showing
9 changed files
with
164 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#' @useDynLib utf8, .registration = TRUE, .fixes = "C_" | ||
"_PACKAGE" | ||
|
||
#' Encoding of each element of a character vector
#'
#' Returns `"ASCII"` for pure ASCII strings and otherwise reports the same
#' label as [base::Encoding()]. The work is done by the registered native
#' routine `C_encoding`, which raises an error if `x` is not a character
#' vector.
#'
#' @param x A character vector.
#' @return A character vector with one encoding label per element of `x`.
#' @export
encoding <- function(x) .Call(C_encoding, x)
|
||
#' Are all elements of a character vector UTF-8 encoded?
#'
#' Pure ASCII strings count as UTF-8. The check is done by the registered
#' native routine `C_all_utf8`, which raises an error if `x` is not a
#' character vector.
#'
#' @param x A character vector.
#' @return A single logical value: `TRUE` if every element of `x` is marked
#'   as UTF-8 or ASCII (vacuously `TRUE` for a zero-length vector),
#'   `FALSE` otherwise.
#' @export
all_utf8 <- function(x) .Call(C_all_utf8, x)
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
*.o | ||
*.so | ||
*.dll |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#include <sys/time.h> | ||
#include <sys/resource.h> | ||
|
||
#define USE_RINTERNALS | ||
#include <R.h> | ||
#include <Rinternals.h> | ||
#include <R_ext/RS.h> | ||
|
||
/*
 * Bit masks applied to the `sxpinfo.gp` field of a string element.
 * NOTE(review): these values presumably mirror the private definitions in
 * R's own sources (Defn.h); confirm against the R version being targeted.
 */
#define BYTES_MASK  (1 << 1)
#define LATIN1_MASK (1 << 2)
#define UTF8_MASK   (1 << 3)
#define CACHED_MASK (1 << 5)
#define ASCII_MASK  (1 << 6)

/*
 * Flag tests. Each expands to the masked bits themselves (zero or the mask
 * value), so use the result only in a boolean context.
 */
#define IS_BYTES(x)  ((x)->sxpinfo.gp & BYTES_MASK)
#define IS_LATIN1(x) ((x)->sxpinfo.gp & LATIN1_MASK)
#define IS_ASCII(x)  ((x)->sxpinfo.gp & ASCII_MASK)
#define IS_UTF8(x)   ((x)->sxpinfo.gp & UTF8_MASK)
/* Non-zero when the element is marked as either latin1 or UTF-8. */
#define ENC_KNOWN(x) ((x)->sxpinfo.gp & (LATIN1_MASK | UTF8_MASK))
|
||
/*
 * Map the encoding flags of a single string element to its label.
 *
 * Flags are tested in the order bytes, latin1, UTF-8, ASCII; the first one
 * that is set wins. An element with none of these flags set is reported as
 * "unknown". The returned pointer refers to a string literal and must not
 * be modified or freed.
 */
static const char *encoding_label(SEXP elt)
{
    if (IS_BYTES(elt))
        return "bytes";
    if (IS_LATIN1(elt))
        return "latin1";
    if (IS_UTF8(elt))
        return "UTF-8";
    if (IS_ASCII(elt))
        return "ASCII";
    return "unknown";
}

/*
 * .Call entry point: encoding label for every element of a character vector.
 *
 * x       a character vector (STRSXP); any other type raises an R error.
 * returns a newly allocated character vector of the same length as x whose
 *         i-th element is one of "bytes", "latin1", "UTF-8", "ASCII" or
 *         "unknown".
 */
SEXP encoding(SEXP x)
{
    if (TYPEOF(x) != STRSXP)
        error("a character vector argument expected");

    R_xlen_t n = XLENGTH(x);
    /* Keep the result protected while mkChar() may allocate. */
    SEXP ans = PROTECT(allocVector(STRSXP, n));
    for (R_xlen_t i = 0; i < n; i++)
        SET_STRING_ELT(ans, i, mkChar(encoding_label(STRING_ELT(x, i))));
    UNPROTECT(1);
    return ans;
}
|
||
/*
 * .Call entry point: test whether every element of a character vector is
 * flagged as UTF-8 or ASCII.
 *
 * x       a character vector (STRSXP); any other type raises an R error.
 * returns a length-one logical vector: TRUE when all elements carry the
 *         UTF-8 or ASCII flag (including the zero-length case), FALSE as
 *         soon as one element carries neither.
 */
SEXP all_utf8(SEXP x)
{
    if (TYPEOF(x) != STRSXP)
        error("a character vector argument expected");

    const R_xlen_t len = XLENGTH(x);
    int every_elt_ok = TRUE;

    /* Stop scanning at the first element that is neither UTF-8 nor ASCII. */
    for (R_xlen_t idx = 0; idx < len && every_elt_ok; idx++) {
        SEXP elt = STRING_ELT(x, idx);
        if (!IS_UTF8(elt) && !IS_ASCII(elt))
            every_elt_ok = FALSE;
    }

    SEXP result = PROTECT(allocVector(LGLSXP, 1));
    LOGICAL(result)[0] = every_elt_ok;
    UNPROTECT(1);
    return result;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
extern SEXP encoding(SEXP chars); | ||
extern SEXP all_utf8(SEXP chars); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/* | ||
* R : A Computer Language for Statistical Data Analysis | ||
* Copyright (C) 2012 The R Core Team. | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation; either version 2 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program; if not, a copy is available at | ||
* http://www.r-project.org/Licenses/ | ||
*/ | ||
|
||
#ifdef HAVE_CONFIG_H | ||
#include <config.h> | ||
#endif | ||
|
||
#include <R.h> | ||
#include <Rinternals.h> | ||
|
||
#include "encoding.h" | ||
#include <R_ext/Rdynload.h> | ||
|
||
|
||
#define CALLDEF(name, n) {#name, (DL_FUNC) &name, n} | ||
|
||
static const R_CallMethodDef CallEntries[] = { | ||
CALLDEF(encoding, 1), | ||
CALLDEF(all_utf8, 1), | ||
|
||
{NULL, NULL, 0} | ||
}; | ||
|
||
/*
 * Shared-library initialisation hook for the package.
 *
 * Registers the .Call routines listed in CallEntries; no .C, .Fortran or
 * .External tables are supplied (those arguments are NULL).
 */
void R_init_utf8(DllInfo *dll)
{
    R_registerRoutines(dll,
                       NULL,         /* .C routines        */
                       CallEntries,  /* .Call routines     */
                       NULL,         /* .Fortran routines  */
                       NULL);        /* .External routines */

    /* Restrict lookup to the registered routines only. */
    R_useDynamicSymbols(dll, FALSE);

    /* Require routines to be referenced through registered symbols. */
    R_forceSymbols(dll, TRUE);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Test helper: return `x` with its declared encoding reset to "unknown".
as_unknown <- function(x) {
  `Encoding<-`(x, "unknown")
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
context("ascii") | ||
|
||
test_that("ASCII encoding detected works", {
  # One fixture per label that this test expects encoding() to report.
  input <- c(
    "a",
    iconv("ä", to = "UTF-8"),
    iconv("ä", to = "latin1"),
    as_unknown("ä")
  )
  expected <- c("ASCII", "UTF-8", "latin1", "unknown")

  expect_equal(encoding(input), expected)
})
|
||
test_that("all_utf8()", {
  # Inputs expected to pass: empty, pure ASCII, UTF-8, and a mix of both.
  expect_true(all_utf8(character()))
  expect_true(all_utf8("a"))
  utf8_string <- iconv("ä", to = "UTF-8")
  expect_true(all_utf8(utf8_string))
  expect_true(all_utf8(c("a", iconv("ä", to = "UTF-8"))))

  # Inputs expected to fail: a latin1 or unknown-encoding element is present.
  latin1_string <- iconv("ä", to = "latin1")
  expect_false(all_utf8(latin1_string))
  expect_false(all_utf8(c("a", iconv("ä", to = "latin1"))))
  expect_false(all_utf8(c("a", as_unknown("ä"))))
})