Skip to content

Commit

Permalink
support categorical feature converters (CTR and count)
Browse files Browse the repository at this point in the history
  • Loading branch information
shiyu1994 committed Sep 7, 2020
1 parent f5f27ca commit 3d3d899
Show file tree
Hide file tree
Showing 191 changed files with 9,372 additions and 2,255 deletions.
2 changes: 1 addition & 1 deletion .appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: 2.3.2.{build}
version: 3.0.0.99.{build}

image: Visual Studio 2015
platform: x64
Expand Down
6 changes: 0 additions & 6 deletions .ci/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,6 @@ if [[ $OS_NAME == "macos" ]]; then
else # gcc
if [[ $TASK != "mpi" ]]; then
brew install gcc
if [[ $GITHUB_ACTIONS == "true" ]]; then
brew update
fi
if [[ $TRAVIS == "true" ]] || [[ $GITHUB_ACTIONS == "true" ]]; then
brew upgrade gcc
fi
fi
fi
if [[ $TASK == "mpi" ]]; then
Expand Down
112 changes: 84 additions & 28 deletions .ci/test_r_package.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,29 @@ mkdir -p $R_LIB_PATH
export R_LIBS=$R_LIB_PATH
export PATH="$R_LIB_PATH/R/bin:$PATH"

# hack to get around this:
# https://stat.ethz.ch/pipermail/r-package-devel/2020q3/005930.html
export _R_CHECK_SYSTEM_CLOCK_=0

# Get details needed for installing R components
#
# NOTES:
# * Linux builds on Azure use a container and don't need these details
if ! { [[ $AZURE == "true" ]] && [[ $OS_NAME == "linux" ]]; }; then
R_MAJOR_VERSION=( ${R_VERSION//./ } )
if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
export R_MAC_VERSION=3.6.3
export R_LINUX_VERSION="3.6.3-1bionic"
export R_APT_REPO="bionic-cran35/"
elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then
export R_MAC_VERSION=4.0.2
export R_LINUX_VERSION="4.0.2-1.1804.0"
export R_APT_REPO="bionic-cran40/"
else
echo "Unrecognized R version: ${R_VERSION}"
exit -1
fi
R_MAJOR_VERSION=( ${R_VERSION//./ } )
if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
export R_MAC_VERSION=3.6.3
export R_LINUX_VERSION="3.6.3-1bionic"
export R_APT_REPO="bionic-cran35/"
elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then
export R_MAC_VERSION=4.0.2
export R_LINUX_VERSION="4.0.2-1.1804.0"
export R_APT_REPO="bionic-cran40/"
else
echo "Unrecognized R version: ${R_VERSION}"
exit -1
fi

# installing precompiled R for Ubuntu
# https://cran.r-project.org/bin/linux/ubuntu/#installation
# adding steps from https://stackoverflow.com/a/56378217/3986677 to get latest version
#
# This only needs to get run on Travis because R environment for Linux
# used by Azure pipelines is set up in https://github.com/guolinke/lightgbm-ci-docker
if [[ $AZURE != "true" ]] && [[ $OS_NAME == "linux" ]]; then
if [[ $OS_NAME == "linux" ]]; then
sudo apt-key adv \
--keyserver keyserver.ubuntu.com \
--recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
Expand All @@ -42,18 +38,33 @@ if [[ $AZURE != "true" ]] && [[ $OS_NAME == "linux" ]]; then
sudo apt-get update
sudo apt-get install \
--no-install-recommends \
-y \
-y --allow-downgrades \
r-base-dev=${R_LINUX_VERSION} \
texinfo \
texlive-latex-recommended \
texlive-fonts-recommended \
texlive-fonts-extra \
qpdf \
|| exit -1

# https://github.com/r-lib/actions/issues/111
if [[ $R_BUILD_TYPE == "cran" ]]; then
sudo apt-get install \
--no-install-recommends \
-y \
autoconf=$(cat R-package/AUTOCONF_UBUNTU_VERSION) \
devscripts \
|| exit -1
fi
fi

# Installing R precompiled for Mac OS 10.11 or higher
if [[ $OS_NAME == "macos" ]]; then
if [[ $R_BUILD_TYPE == "cran" ]]; then
brew install \
automake \
checkbashisms
fi
brew install qpdf
brew cask install basictex
export PATH="/Library/TeX/texbin:$PATH"
Expand Down Expand Up @@ -109,20 +120,50 @@ if [[ $TASK == "r-package-check-docs" ]]; then
fi

cd ${BUILD_DIRECTORY}
Rscript build_r.R --skip-install || exit -1

PKG_TARBALL="lightgbm_${LGB_VER}.tar.gz"
PKG_TARBALL="lightgbm_*.tar.gz"
LOG_FILE_NAME="lightgbm.Rcheck/00check.log"
if [[ $R_BUILD_TYPE == "cmake" ]]; then
Rscript build_r.R --skip-install || exit -1
elif [[ $R_BUILD_TYPE == "cran" ]]; then

# on Linux, we recreate configure in CI to test if
# a change in a PR has changed configure.ac
if [[ $OS_NAME == "linux" ]]; then
${BUILD_DIRECTORY}/R-package/recreate-configure.sh

num_files_changed=$(
git diff --name-only | wc -l
)
if [[ ${num_files_changed} -gt 0 ]]; then
echo "'configure' in the R package has changed. Please recreate it and commit the changes."
echo "Changed files:"
git diff --compact-summary
echo "See R-package/README.md for details on how to recreate this script."
echo ""
exit -1
fi
fi

# suppress R CMD check warning from Suggests dependencies not being available
export _R_CHECK_FORCE_SUGGESTS_=0
./build-cran-package.sh || exit -1

# Test CRAN source .tar.gz in a directory that is not this repo or below it.
# When people install.packages('lightgbm'), they won't have the LightGBM
# git repo around. This is to protect against the use of relative paths
# like ../../CMakeLists.txt that would only work if you are in the repo
R_CMD_CHECK_DIR="${HOME}/tmp-r-cmd-check/"
mkdir -p ${R_CMD_CHECK_DIR}
mv ${PKG_TARBALL} ${R_CMD_CHECK_DIR}
cd ${R_CMD_CHECK_DIR}
fi

# fails tests if either ERRORs or WARNINGs are thrown by
# R CMD CHECK
check_succeeded="yes"
(
R CMD check ${PKG_TARBALL} \
--as-cran \
--run-dontrun \
|| check_succeeded="no"
) &

Expand All @@ -137,7 +178,8 @@ while kill -0 ${CHECK_PID} >/dev/null 2>&1; do
done

echo "R CMD check build logs:"
cat ${BUILD_DIRECTORY}/lightgbm.Rcheck/00install.out
BUILD_LOG_FILE=lightgbm.Rcheck/00install.out
cat ${BUILD_LOG_FILE}

if [[ $check_succeeded == "no" ]]; then
exit -1
Expand All @@ -148,7 +190,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then
exit -1
fi

ALLOWED_CHECK_NOTES=3
ALLOWED_CHECK_NOTES=2
NUM_CHECK_NOTES=$(
cat ${LOG_FILE_NAME} \
| grep -e '^Status: .* NOTE.*' \
Expand All @@ -158,3 +200,17 @@ if [[ ${NUM_CHECK_NOTES} -gt ${ALLOWED_CHECK_NOTES} ]]; then
echo "Found ${NUM_CHECK_NOTES} NOTEs from R CMD check. Only ${ALLOWED_CHECK_NOTES} are allowed"
exit -1
fi

# this check makes sure that CI builds of the CRAN package on Mac
# actually use OpenMP
if [[ $OS_NAME == "macos" ]] && [[ $R_BUILD_TYPE == "cran" ]]; then
omp_working=$(
cat $BUILD_LOG_FILE \
| grep -E "checking whether OpenMP will work .*yes" \
| wc -l
)
if [[ $omp_working -ne 1 ]]; then
echo "OpenMP was not found, and should be when testing the CRAN package on macOS"
exit -1
fi
fi
119 changes: 75 additions & 44 deletions .ci/test_r_package_windows.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ function Download-File-With-Retries {
do {
Write-Output "Downloading ${url}"
sleep 5;
(New-Object System.Net.WebClient).DownloadFile($url, $destfile)
Invoke-WebRequest -Uri $url -OutFile $destfile
} while(!$?);
}

Expand Down Expand Up @@ -44,33 +44,41 @@ function Run-R-Code-Redirect-Stderr {
Rscript --vanilla -e $decorated_code
}

$env:R_LIB_PATH = "$env:BUILD_SOURCESDIRECTORY/RLibrary" -replace '[\\]', '/'
$env:R_LIBS = "$env:R_LIB_PATH"
$env:PATH = "$env:R_LIB_PATH/Rtools/bin;" + "$env:R_LIB_PATH/Rtools/usr/bin;" + "$env:R_LIB_PATH/R/bin/x64;" + "$env:R_LIB_PATH/miktex/texmfs/install/miktex/bin/x64;" + $env:PATH
$env:CRAN_MIRROR = "https://cloud.r-project.org/"
$env:CTAN_MIRROR = "https://ctan.math.illinois.edu/systems/win32/miktex"
$env:CTAN_MIKTEX_ARCHIVE = "$env:CTAN_MIRROR/setup/windows-x64/"
$env:CTAN_PACKAGE_ARCHIVE = "$env:CTAN_MIRROR/tm/packages/"

# Get details needed for installing R components
#
# NOTES:
# * some paths and file names are different on R4.0
$env:R_MAJOR_VERSION = $env:R_VERSION.split('.')[0]
if ($env:R_MAJOR_VERSION -eq "3") {
$env:RTOOLS_MINGW_BIN = "$env:R_LIB_PATH/Rtools/mingw_64/bin"
# Rtools 3.x has to be installed at C:\Rtools\
# * https://stackoverflow.com/a/46619260/3986677
$RTOOLS_INSTALL_PATH = "C:\Rtools"
$env:RTOOLS_MINGW_BIN = "$RTOOLS_INSTALL_PATH/mingw_64/bin"
$env:RTOOLS_EXE_FILE = "Rtools35.exe"
$env:R_WINDOWS_VERSION = "3.6.3"
} elseif ($env:R_MAJOR_VERSION -eq "4") {
$env:RTOOLS_MINGW_BIN = "$env:R_LIB_PATH/Rtools/mingw64/bin"
$RTOOLS_INSTALL_PATH = "C:\rtools40"
$env:RTOOLS_MINGW_BIN = "$RTOOLS_INSTALL_PATH/mingw64/bin"
$env:RTOOLS_EXE_FILE = "rtools40-x86_64.exe"
$env:R_WINDOWS_VERSION = "4.0.2"
} else {
Write-Output "[ERROR] Unrecognized R version: $env:R_VERSION"
Check-Output $false
}

if ($env:COMPILER -eq "MINGW") {
$env:R_LIB_PATH = "$env:BUILD_SOURCESDIRECTORY/RLibrary" -replace '[\\]', '/'
$env:R_LIBS = "$env:R_LIB_PATH"
$env:PATH = "$RTOOLS_INSTALL_PATH/bin;" + "$RTOOLS_INSTALL_PATH/usr/bin;" + "$env:R_LIB_PATH/R/bin/x64;" + "$env:R_LIB_PATH/miktex/texmfs/install/miktex/bin/x64;" + $env:PATH
$env:CRAN_MIRROR = "https://cloud.r-project.org/"
$env:CTAN_MIRROR = "https://ctan.math.illinois.edu/systems/win32/miktex"
$env:CTAN_MIKTEX_ARCHIVE = "$env:CTAN_MIRROR/setup/windows-x64/"
$env:CTAN_PACKAGE_ARCHIVE = "$env:CTAN_MIRROR/tm/packages/"

# hack to get around this:
# https://stat.ethz.ch/pipermail/r-package-devel/2020q3/005930.html
$env:_R_CHECK_SYSTEM_CLOCK_ = 0

if (($env:COMPILER -eq "MINGW") -and ($env:R_BUILD_TYPE -eq "cmake")) {
$env:CXX = "$env:RTOOLS_MINGW_BIN/g++.exe"
$env:CC = "$env:RTOOLS_MINGW_BIN/gcc.exe"
}
Expand All @@ -79,42 +87,46 @@ cd $env:BUILD_SOURCESDIRECTORY
tzutil /s "GMT Standard Time"
[Void][System.IO.Directory]::CreateDirectory($env:R_LIB_PATH)

if ($env:TOOLCHAIN -eq "MINGW") {
Write-Output "Telling R to use MinGW"
$install_libs = "$env:BUILD_SOURCESDIRECTORY/R-package/src/install.libs.R"
((Get-Content -Path $install_libs -Raw) -Replace 'use_mingw <- FALSE','use_mingw <- TRUE') | Set-Content -Path $install_libs
} elseif ($env:TOOLCHAIN -eq "MSYS") {
Write-Output "Telling R to use MSYS"
$install_libs = "$env:BUILD_SOURCESDIRECTORY/R-package/src/install.libs.R"
((Get-Content -Path $install_libs -Raw) -Replace 'use_msys2 <- FALSE','use_msys2 <- TRUE') | Set-Content -Path $install_libs
} elseif ($env:TOOLCHAIN -eq "MSVC") {
# no customization for MSVC
} else {
Write-Output "[ERROR] Unrecognized compiler: $env:TOOLCHAIN"
Check-Output $false
if ($env:R_BUILD_TYPE -eq "cmake") {
if ($env:TOOLCHAIN -eq "MINGW") {
Write-Output "Telling R to use MinGW"
$install_libs = "$env:BUILD_SOURCESDIRECTORY/R-package/src/install.libs.R"
((Get-Content -Path $install_libs -Raw) -Replace 'use_mingw <- FALSE','use_mingw <- TRUE') | Set-Content -Path $install_libs
} elseif ($env:TOOLCHAIN -eq "MSYS") {
Write-Output "Telling R to use MSYS"
$install_libs = "$env:BUILD_SOURCESDIRECTORY/R-package/src/install.libs.R"
((Get-Content -Path $install_libs -Raw) -Replace 'use_msys2 <- FALSE','use_msys2 <- TRUE') | Set-Content -Path $install_libs
} elseif ($env:TOOLCHAIN -eq "MSVC") {
# no customization for MSVC
} else {
Write-Output "[ERROR] Unrecognized toolchain: $env:TOOLCHAIN"
Check-Output $false
}
}

# download R and RTools
Write-Output "Downloading R and Rtools"
Download-File-With-Retries -url "https://cloud.r-project.org/bin/windows/base/old/$env:R_WINDOWS_VERSION/R-$env:R_WINDOWS_VERSION-win.exe" -destfile "R-win.exe"
Download-File-With-Retries -url "https://cloud.r-project.org/bin/windows/Rtools/$env:RTOOLS_EXE_FILE" -destfile "Rtools.exe"
Download-File-With-Retries -url "https://cran.r-project.org/bin/windows/base/old/$env:R_WINDOWS_VERSION/R-$env:R_WINDOWS_VERSION-win.exe" -destfile "R-win.exe"
Download-File-With-Retries -url "https://cran.r-project.org/bin/windows/Rtools/$env:RTOOLS_EXE_FILE" -destfile "Rtools.exe"

# Install R
Write-Output "Installing R"
Start-Process -FilePath R-win.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT /DIR=$env:R_LIB_PATH/R /COMPONENTS=main,x64" ; Check-Output $?
Write-Output "Done installing R"

Write-Output "Installing Rtools"
Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT /DIR=$env:R_LIB_PATH/Rtools" ; Check-Output $?
./Rtools.exe /VERYSILENT /SUPPRESSMSGBOXES /DIR=$RTOOLS_INSTALL_PATH ; Check-Output $?
Write-Output "Done installing Rtools"

Write-Output "Installing dependencies"
$packages = "c('data.table', 'jsonlite', 'Matrix', 'processx', 'R6', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH')" ; Check-Output $?

# MiKTeX and pandoc can be skipped on MSVC builds, since we don't
# build the package documentation for those
if ($env:COMPILER -ne "MSVC") {
# MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't
# build the package documentation for those.
#
# MiKTeX always needs to be built to test a CRAN package.
if (($env:COMPILER -eq "MINGW") -or ($env:R_BUILD_TYPE -eq "cran")) {
Download-Miktex-Setup "$env:CTAN_MIKTEX_ARCHIVE" "miktexsetup-x64.zip"
Add-Type -AssemblyName System.IO.Compression.FileSystem
[System.IO.Compression.ZipFile]::ExtractToDirectory("miktexsetup-x64.zip", "miktex")
Expand All @@ -132,18 +144,35 @@ Write-Output "Building R package"

# R CMD check is not used for MSVC builds
if ($env:COMPILER -ne "MSVC") {
Run-R-Code-Redirect-Stderr "commandArgs <- function(...){'--skip-install'}; source('build_r.R')"; Check-Output $?

$PKG_FILE_NAME = Get-Item *.tar.gz
$PKG_FILE_NAME = $PKG_FILE_NAME -replace '[\\]', '/'
$PKG_FILE_NAME = "lightgbm_*.tar.gz"
$LOG_FILE_NAME = "lightgbm.Rcheck/00check.log"

$env:_R_CHECK_FORCE_SUGGESTS_ = 0
Write-Output "Running R CMD check as CRAN"
Run-R-Code-Redirect-Stderr "result <- processx::run(command = 'R.exe', args = c('CMD', 'check', '--no-multiarch', '--as-cran', '$PKG_FILE_NAME'), echo = TRUE, windows_verbatim_args = FALSE)" ; $check_succeeded = $?
if ($env:R_BUILD_TYPE -eq "cmake") {
Run-R-Code-Redirect-Stderr "commandArgs <- function(...){'--skip-install'}; source('build_r.R')"; Check-Output $?
} elseif ($env:R_BUILD_TYPE -eq "cran") {
Run-R-Code-Redirect-Stderr "result <- processx::run(command = 'sh', args = 'build-cran-package.sh', echo = TRUE, windows_verbatim_args = FALSE)" ; Check-Output $?
# Test CRAN source .tar.gz in a directory that is not this repo or below it.
# When people install.packages('lightgbm'), they won't have the LightGBM
# git repo around. This is to protect against the use of relative paths
# like ../../CMakeLists.txt that would only work if you are in the repoo
$R_CMD_CHECK_DIR = "tmp-r-cmd-check"
New-Item -Path "C:\" -Name $R_CMD_CHECK_DIR -ItemType "directory" > $null
Move-Item -Path "$PKG_FILE_NAME" -Destination "C:\$R_CMD_CHECK_DIR\" > $null
cd "C:\$R_CMD_CHECK_DIR\"
}

Write-Output "Running R CMD check"
if ($env:R_BUILD_TYPE -eq "cran") {
# CRAN packages must pass without --no-multiarch (build on 64-bit and 32-bit)
$check_args = "c('CMD', 'check', '--as-cran', '--run-dontrun', '$PKG_FILE_NAME')"
} else {
$check_args = "c('CMD', 'check', '--no-multiarch', '--as-cran', '--run-dontrun', '$PKG_FILE_NAME')"
}
Run-R-Code-Redirect-Stderr "result <- processx::run(command = 'R.exe', args = $check_args, echo = TRUE, windows_verbatim_args = FALSE)" ; $check_succeeded = $?

Write-Output "R CMD check build logs:"
$INSTALL_LOG_FILE_NAME = "$env:BUILD_SOURCESDIRECTORY\lightgbm.Rcheck\00install.out"
$INSTALL_LOG_FILE_NAME = "lightgbm.Rcheck\00install.out"
Get-Content -Path "$INSTALL_LOG_FILE_NAME"

Check-Output $check_succeeded
Expand All @@ -157,7 +186,7 @@ if ($env:COMPILER -ne "MSVC") {
$note_str = Get-Content -Path "${LOG_FILE_NAME}" | Select-String -Pattern '.*Status.* NOTE' | Out-String ; Check-Output $?
$relevant_line = $note_str -match '(\d+) NOTE'
$NUM_CHECK_NOTES = $matches[1]
$ALLOWED_CHECK_NOTES = 3
$ALLOWED_CHECK_NOTES = 2
if ([int]$NUM_CHECK_NOTES -gt $ALLOWED_CHECK_NOTES) {
Write-Output "Found ${NUM_CHECK_NOTES} NOTEs from R CMD check. Only ${ALLOWED_CHECK_NOTES} are allowed"
Check-Output $False
Expand All @@ -175,15 +204,17 @@ if ($env:COMPILER -ne "MSVC") {
# Checking that we actually got the expected compiler. The R package has some logic
# to fail back to MinGW if MSVC fails, but for CI builds we need to check that the correct
# compiler was used.
$checks = Select-String -Path "${INSTALL_LOG_FILE_NAME}" -Pattern "Check for working CXX compiler.*$env:COMPILER"
if ($checks.Matches.length -eq 0) {
Write-Output "The wrong compiler was used. Check the build logs."
Check-Output $False
if ($env:R_BUILD_TYPE -eq "cmake") {
$checks = Select-String -Path "${INSTALL_LOG_FILE_NAME}" -Pattern "Check for working CXX compiler.*$env:COMPILER"
if ($checks.Matches.length -eq 0) {
Write-Output "The wrong compiler was used. Check the build logs."
Check-Output $False
}
}

# Checking that we got the right toolchain for MinGW. If using MinGW, both
# MinGW and MSYS toolchains are supported
if ($env:COMPILER -eq "MINGW") {
if (($env:COMPILER -eq "MINGW") -and ($env:R_BUILD_TYPE -eq "cmake")) {
$checks = Select-String -Path "${INSTALL_LOG_FILE_NAME}" -Pattern "Trying to build with.*$env:TOOLCHAIN"
if ($checks.Matches.length -eq 0) {
Write-Output "The wrong toolchain was used. Check the build logs."
Expand Down
Loading

0 comments on commit 3d3d899

Please sign in to comment.