Merge pull request #2 from marberts/generic-matrix

Added match_first, closing #1
marberts · Nov 22, 2024 · 9f3378b · 9f3378b
2 parents d95d3c5 + 3421558
commit 9f3378b
Show file tree

Hide file tree

Showing 7 changed files with 69 additions and 32 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rsmatrix
 Title: Matrices for Repeat-Sales Price Indexes
-Version: 0.2.8.9001
+Version: 0.2.8.9002
 Authors@R: c(
     person(given = "Steve", family = "Martin", role = c("aut", "cre", "cph"),
            email = "marberts@protonmail.com",
@@ -23,5 +23,5 @@ URL: https://marberts.github.io/rsmatrix/, https://github.com/marberts/rsmatrix
 BugReports: https://github.com/marberts/rsmatrix/issues
 Config/testthat/edition: 3
 VignetteBuilder: knitr
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Roxygen: list(markdown = TRUE)
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,9 @@
 
 - Updated maintainer email.
 
+- `rs_pairs()` gets a new argument `match_first` to control if products in the
+first period match to themselves (#1).
+
 ## Changes in version 0.2.8
 
 - Added a vignette.
@@ -14,17 +17,20 @@
 
 ## Changes in version 0.2.3
 
-- Making the "Y" vector with rs_matrix() no longer gives an error with length-0 inputs and a factor with non-empty levels.
+- Making the `"Y"` vector with `rs_matrix()` no longer gives an error with
+length-0 inputs and a factor with non-empty levels.
 
-- rs_matrix() cleans up the enclosing environment of its result.
+- `rs_matrix()` cleans up the enclosing environment of its result.
 
 ## Changes in version 0.2.1
 
-- rs_pairs() and rs_matrix() are now faster, and less picky about their inputs for time periods.
+- `rs_pairs()` and `rs_matrix()` are now faster, and less picky about their
+inputs for time periods.
 
 ## Changes in version 0.2.0
 
-- rs_pairs() has been reworked to be much faster and more general, while rs_unpair() has been removed. These changes are not backwards compatible.
+- `rs_pairs()` has been reworked to be much faster and more general,
+while `rs_unpair()` has been removed. These changes are not backwards compatible.
 
 - Added French translations.
 

diff --git a/R/rs_matrix.R b/R/rs_matrix.R
@@ -8,10 +8,10 @@ different_lengths <- function(...) {
 #' Compute the Z matrix (internal)
 #' @noRd
 rs_z_ <- function(t2, t1, f = NULL, sparse = FALSE) {
-  # coerce t2 and t1 into characters prior to taking the union
-  # so that both dates and factors are treated the same
-  lev2 <- as.character(unique(t2))
-  lev1 <- as.character(unique(t1))
+  # t2 and t1 are coerced into characters prior to taking the union
+  # so that both dates and factors are treated the same.
+  lev2 <- unique(as.character(t2))
+  lev1 <- unique(as.character(t1))
   lev <- sort.int(unique(c(lev2, lev1))) # usually faster than base::union()
   t2 <- factor(t2, lev)
   t1 <- factor(t1, lev)
@@ -22,7 +22,7 @@ rs_z_ <- function(t2, t1, f = NULL, sparse = FALSE) {
     )
   }
 
-  # make row names before interacting with f
+  # Make row names before interacting with f.
   nm <- if (!is.null(names(t2))) {
     names(t2)
   } else if (!is.null(names(t1))) {
@@ -40,7 +40,7 @@ rs_z_ <- function(t2, t1, f = NULL, sparse = FALSE) {
     lev <- levels(t2)
   }
 
-  # calculate Z
+  # Calculate Z.
   dims <- c(length(nm), length(lev))
   attributes(t2) <- NULL
   attributes(t1) <- NULL
@@ -70,7 +70,9 @@ rs_z_ <- function(t2, t1, f = NULL, sparse = FALSE) {
 
 #' Compute X matrix (internal)
 #' @noRd
-rs_x_ <- function(z, p2, p1) (z > 0) * p2 - (z < 0) * p1
+rs_x_ <- function(z, p2, p1) {
+  (z > 0) * p2 - (z < 0) * p1
+} 
 
 #' Shiller's repeat-sales matrices
 #'
@@ -165,6 +167,11 @@ rs_x_ <- function(z, p2, p1) (z > 0) * p2 - (z < 0) * p1
 #'
 #' @export
 rs_matrix <- function(t2, t1, p2, p1, f = NULL, sparse = FALSE) {
+  t2 <- as.character(t2)
+  t1 <- as.character(t1)
+  p2 <- as.numeric(p2)
+  p1 <- as.numeric(p1)
+
   if (is.null(f)) {
     if (different_lengths(t2, t1, p2, p1)) {
       stop("'t2', 't1', 'p2', and 'p1' must be the same length")
@@ -181,23 +188,23 @@ rs_matrix <- function(t2, t1, p2, p1, f = NULL, sparse = FALSE) {
       stop("'t2', 't1', and 'f' cannot contain NAs")
     }
   }
+
   z <- rs_z_(t2, t1, f, sparse)
-  p2 <- as.numeric(p2)
-  p1 <- as.numeric(p1)
-  # number of columns that need to be removed for base period
+  # Number of columns that need to be removed for base period.
   n <- max(1L, nlevels(f)) * (ncol(z) > 0)
-  # return value
+
   res <- function(matrix = c("Z", "X", "y", "Y")) {
     switch(match.arg(matrix),
       Z = z[, -seq_len(n), drop = FALSE],
       X = rs_x_(z[, -seq_len(n), drop = FALSE], p2, p1),
       y = structure(log(p2 / p1), names = rownames(z)),
       # rowSums() gets the single value in the base period
-      # for each group
+      # for each group.
       Y = -Matrix::rowSums(rs_x_(z[, seq_len(n), drop = FALSE], p2, p1))
     )
   }
-  # clean up enclosing environment
+
+  # Clean up enclosing environment.
   enc <- list(z = z, n = n, p2 = p2, p1 = p1)
   environment(res) <- list2env(enc, parent = getNamespace("rsmatrix"))
   res

diff --git a/R/rs_pairs.R b/R/rs_pairs.R
@@ -9,12 +9,14 @@
 #' [order()]).
 #' @param product A vector that gives the product identifier for each sale.
 #' Usually a factor or vector of integer codes for each product.
+#' @param match_first Should products in the first period match with
+#' themselves (the default)?
 #'
 #' @returns
 #' A numeric vector of indices giving the position of the previous sale
 #' for each `product`, with the convention that the previous sale for the
-#' first sale is itself. Ties are resolved according to the order they
-#' appear in `period`.
+#' first sale is itself if `match_first = TRUE`, `NA` otherwise. Ties are
+#' resolved according to the order they appear in `period`.
 #'
 #' @note
 #' [`order()`] is the workhorse of `rs_pairs()`, so performance can be
@@ -42,12 +44,12 @@
 #' x
 #'
 #' @export
-rs_pairs <- function(period, product) {
+rs_pairs <- function(period, product, match_first = TRUE) {
   if (length(product) != length(period)) {
     stop("'period' and 'product' must be the same length")
   }
 
-  # != is slow for factors with many levels, so use the integer codes
+  # != is slow for factors with many levels, so use the integer codes.
   if (is.factor(product)) {
     attributes(product) <- NULL
   }
@@ -58,11 +60,15 @@ rs_pairs <- function(period, product) {
   }
 
   res <- rep.int(NA_integer_, length(period))
-  # offset the period by product ordering
+  # Offset the period by product ordering.
   res[ord] <- ord[c(1L, seq_len(length(ord) - 1L))]
-  # the first period for each product points to the last period
-  # for the previous product
+  # The first period for each product points to the last period
+  # for the previous product.
   first <- which(product[res] != product)
   res[first] <- first
+
+  if (!match_first) {
+    res[period[res] == period] <- NA
+  }
   res
 }
diff --git a/R/rs_var.R b/R/rs_var.R
@@ -78,14 +78,14 @@ rs_var <- function(u, Z, X = Z, ids = seq_len(nrow(X)), df = NULL) {
   } else {
     as.numeric(df)
   }
-  # the meat
+  # The meat.
   ug <- split.data.frame(u, ids)
   Zg <- split.data.frame(Z, ids)
   V <- Map(function(x, y) tcrossprod(crossprod(x, y)), Zg, ug)
   V <- Reduce(`+`, V)
-  # the bread
+  # The bread.
   B <- solve(crossprod(Z, X))
-  # put the sandwich together
+  # Put the sandwich together.
   vcov <- tcrossprod(B %*% V, B)
   df * vcov
 }
diff --git a/man/rs_pairs.Rd b/man/rs_pairs.Rd
diff --git a/tests/testthat/test-rs_pairs.R b/tests/testthat/test-rs_pairs.R
@@ -20,6 +20,11 @@ test_that("an easy example works", {
     rs_pairs(x, y),
     c(1L, 10L, 3L, 3L, 4L, 7L, 1L, 8L, 2L, 10L, 8L)
   )
+
+  expect_equal(
+    rs_pairs(x, y, match_first = FALSE),
+    c(NA, 10, NA, 3, 4, 7, 1, NA, 2, NA, NA)
+  )
 
   x[2] <- NA
   expect_identical(
@@ -53,9 +58,14 @@ test_that("a more complex example works", {
 
 test_that("corner cases work", {
   expect_identical(rs_pairs(numeric(0), character(0)), integer(0))
+  expect_identical(
+    rs_pairs(numeric(0), character(0), match_first = FALSE),
+    integer(0)
+  )
   expect_identical(rs_pairs(1:4, rep(NA, 4)), integer(0))
   expect_identical(rs_pairs(rep(NA, 4), 1:4), integer(0))
   expect_identical(rs_pairs(1, 1), 1L)
+  expect_identical(rs_pairs(1, 1, match_first = FALSE), NA_integer_)
   expect_identical(rs_pairs(rep(1, 10), 1:10), 1:10)
   expect_identical(rs_pairs(1:10, rep(1, 10)), c(1L, 1:9))
   expect_identical(rs_pairs(c(1, 2, 3, 2), rep(1, 4)), c(1L, 1L, 4L, 2L))
@@ -68,6 +78,11 @@ test_that("sales pairs are back periods", {
     rs_pairs(period, product),
     c(11, 4, 10, 9, 8, 6, 1, 8, 9, 5, 6, 2)
   )
+
+  expect_equal(
+    rs_pairs(period, product, match_first = FALSE),
+    c(11, 4, 10, 9, 8, NA, 1, NA, NA, 5, 6, 2)
+  )
 })
 
 test_that("different length inputs is an error", {