Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Merge branch 'f-#1-all-utf8'. Fixes #1.
Browse files Browse the repository at this point in the history
- New `encoding()`, returns `"ASCII"` for pure ASCII strings and behaves identical to `base::Encoding()` otherwise.
- New `all_utf8()`, returns an atomic logical that indicates if all elements of a character vector are UTF-8 encoded; this includes pure ASCII strings.
  • Loading branch information
krlmlr committed Aug 8, 2016
2 parents 910b039 + dc1d64f commit cdeaa90
Show file tree
Hide file tree
Showing 9 changed files with 164 additions and 0 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ S3method(as.utf8,utf8)
S3method(c,utf8)
S3method(format,utf8)
S3method(print,utf8)
export(all_utf8)
export(as.utf8)
export(encoding)
export(is.utf8)
export(utf8)
useDynLib(utf8, .registration = TRUE, .fixes = "C_")
12 changes: 12 additions & 0 deletions R/utf8-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#' @useDynLib utf8, .registration = TRUE, .fixes = "C_"
"_PACKAGE"

#' @export
encoding <- function(x) {
.Call(C_encoding, x)
}

#' @export
all_utf8 <- function(x) {
.Call(C_all_utf8, x)
}
11 changes: 11 additions & 0 deletions man/utf8-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.o
*.so
*.dll
62 changes: 62 additions & 0 deletions src/encoding.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include <sys/time.h>
#include <sys/resource.h>

#define USE_RINTERNALS
#include <R.h>
#include <Rinternals.h>
#include <R_ext/RS.h>

#define BYTES_MASK (1<<1)
#define LATIN1_MASK (1<<2)
#define UTF8_MASK (1<<3)
#define CACHED_MASK (1<<5)
#define ASCII_MASK (1<<6)

# define IS_BYTES(x) ((x)->sxpinfo.gp & BYTES_MASK)
# define IS_LATIN1(x) ((x)->sxpinfo.gp & LATIN1_MASK)
# define IS_ASCII(x) ((x)->sxpinfo.gp & ASCII_MASK)
# define IS_UTF8(x) ((x)->sxpinfo.gp & UTF8_MASK)
# define ENC_KNOWN(x) ((x)->sxpinfo.gp & (LATIN1_MASK | UTF8_MASK))

SEXP encoding(SEXP x)
{
if (TYPEOF(x) != STRSXP)
error("a character vector argument expected");

R_xlen_t n = XLENGTH(x);
SEXP ans;
PROTECT(ans = allocVector(STRSXP, n));
for (R_xlen_t i = 0; i < n; i++) {
char *tmp;
SEXP xi = STRING_ELT(x, i);
if(IS_BYTES(xi)) tmp = "bytes";
else if(IS_LATIN1(xi)) tmp = "latin1";
else if(IS_UTF8(xi)) tmp = "UTF-8";
else if(IS_ASCII(xi)) tmp = "ASCII";
else tmp = "unknown";
SET_STRING_ELT(ans, i, mkChar(tmp));
}
UNPROTECT(1);
return ans;
}

SEXP all_utf8(SEXP x)
{
if (TYPEOF(x) != STRSXP)
error("a character vector argument expected");

R_xlen_t n = XLENGTH(x);
SEXP ans;
PROTECT(ans = allocVector(LGLSXP, 1));
LOGICAL(ans)[0] = TRUE;
for (R_xlen_t i = 0; i < n; i++) {
SEXP xi = STRING_ELT(x, i);
if(IS_UTF8(xi) || IS_ASCII(xi))
continue;

LOGICAL(ans)[0] = FALSE;
break;
}
UNPROTECT(1);
return ans;
}
2 changes: 2 additions & 0 deletions src/encoding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
extern SEXP encoding(SEXP chars);
extern SEXP all_utf8(SEXP chars);
45 changes: 45 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 2012 The R Core Team.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* http://www.r-project.org/Licenses/
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <R.h>
#include <Rinternals.h>

#include "encoding.h"
#include <R_ext/Rdynload.h>


#define CALLDEF(name, n) {#name, (DL_FUNC) &name, n}

static const R_CallMethodDef CallEntries[] = {
CALLDEF(encoding, 1),
CALLDEF(all_utf8, 1),

{NULL, NULL, 0}
};

void R_init_utf8(DllInfo *dll)
{
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
R_forceSymbols(dll, TRUE);
}
4 changes: 4 additions & 0 deletions tests/testthat/helper-encoding.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
as_unknown <- function(x) {
Encoding(x) <- "unknown"
x
}
22 changes: 22 additions & 0 deletions tests/testthat/test-ascii.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
context("ascii")

test_that("ASCII encoding detected works", {
expect_equal(
encoding(c(
"a",
iconv("ä", to = "UTF-8"),
iconv("ä", to = "latin1"),
as_unknown("ä")
)),
c("ASCII", "UTF-8", "latin1", "unknown"))
})

test_that("all_utf8()", {
expect_true(all_utf8(character()))
expect_true(all_utf8("a"))
expect_true(all_utf8(iconv("ä", to = "UTF-8")))
expect_true(all_utf8(c("a", iconv("ä", to = "UTF-8"))))
expect_false(all_utf8(iconv("ä", to = "latin1")))
expect_false(all_utf8(c("a", iconv("ä", to = "latin1"))))
expect_false(all_utf8(c("a", as_unknown("ä"))))
})

0 comments on commit cdeaa90

Please sign in to comment.