From eb42b01f687438953cb4bb450205ff23b402bc1c Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Wed, 28 Nov 2018 11:48:39 -0600 Subject: [PATCH 1/5] CLN: keyerror in versioneer error message --- pandas/_version.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_version.py b/pandas/_version.py index 036c927df45d3..d000539421b91 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -238,14 +238,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): + fmt = ("tag '{full_tag}' doesn't start with prefix " + "'{tag_prefix}'") + msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix) if verbose: - fmt = "tag '{full_tag}' doesn't start with prefix " \ - "'{tag_prefix}'" - print(fmt.format(full_tag=full_tag, tag_prefix=tag_prefix)) - pieces["error"] = ("tag '{full_tag}' doesn't start with " - "prefix '{tag_prefix}'".format( - full_tag, tag_prefix)) + print(msg) + pieces["error"] = msg return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag From 6ef415532d180832219242ae623f7d4e1438a9d7 Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Wed, 28 Nov 2018 14:29:31 -0600 Subject: [PATCH 2/5] PERF: ascii ctype.h functions --- LICENSES/MUSL_LICENSE | 132 ++++++++++++++++++++++++++++ pandas/_libs/src/headers/portable.h | 6 ++ pandas/_libs/src/parse_helper.h | 6 +- pandas/_libs/src/parser/tokenizer.c | 56 ++++++------ 4 files changed, 170 insertions(+), 30 deletions(-) create mode 100644 LICENSES/MUSL_LICENSE diff --git a/LICENSES/MUSL_LICENSE b/LICENSES/MUSL_LICENSE new file mode 100644 index 0000000000000..a8833d4bc4744 --- /dev/null +++ b/LICENSES/MUSL_LICENSE @@ -0,0 +1,132 @@ +musl as a whole is licensed under the following standard MIT license: + +---------------------------------------------------------------------- +Copyright © 2005-2014 Rich Felker, et al. 
+ +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +---------------------------------------------------------------------- + +Authors/contributors include: + +Anthony G. Basile +Arvid Picciani +Bobby Bingham +Boris Brezillon +Brent Cook +Chris Spiegel +Clément Vasseur +Emil Renner Berthing +Hiltjo Posthuma +Isaac Dunham +Jens Gustedt +Jeremy Huntwork +John Spencer +Justin Cormack +Luca Barbato +Luka Perkov +M Farkas-Dyck (Strake) +Michael Forney +Nicholas J. Kain +orc +Pascal Cuoq +Pierre Carrier +Rich Felker +Richard Pennington +sin +Solar Designer +Stefan Kristiansson +Szabolcs Nagy +Timo Teräs +Valentin Ochs +William Haddon + +Portions of this software are derived from third-party works licensed +under terms compatible with the above MIT license: + +The TRE regular expression implementation (src/regex/reg* and +src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed +under a 2-clause BSD license (license text in the source files). 
The +included version has been heavily modified by Rich Felker in 2012, in +the interests of size, simplicity, and namespace cleanliness. + +Much of the math library code (src/math/* and src/complex/*) is +Copyright © 1993,2004 Sun Microsystems or +Copyright © 2003-2011 David Schultz or +Copyright © 2003-2009 Steven G. Kargl or +Copyright © 2003-2009 Bruce D. Evans or +Copyright © 2008 Stephen L. Moshier +and labelled as such in comments in the individual source files. All +have been licensed under extremely permissive terms. + +The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008 +The Android Open Source Project and is licensed under a two-clause BSD +license. It was taken from Bionic libc, used on Android. + +The implementation of DES for crypt (src/misc/crypt_des.c) is +Copyright © 1994 David Burren. It is licensed under a BSD license. + +The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was +originally written by Solar Designer and placed into the public +domain. The code also comes with a fallback permissive license for use +in jurisdictions that may not recognize the public domain. + +The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 +Valentin Ochs and is licensed under an MIT-style license. + +The BSD PRNG implementation (src/prng/random.c) and XSI search API +(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and +licensed under following terms: "Permission to use, copy, modify, +and/or distribute this code for any purpose with or without fee is +hereby granted. There is no warranty." + +The x86_64 port was written by Nicholas J. Kain. Several files (crt) +were released into the public domain; others are licensed under the +standard MIT license terms at the top of this file. See individual +files for their copyright status. + +The mips and microblaze ports were originally written by Richard +Pennington for use in the ellcc project. 
The original code was adapted +by Rich Felker for build system and code conventions during upstream +integration. It is licensed under the standard MIT terms. + +The powerpc port was also originally written by Richard Pennington, +and later supplemented and integrated by John Spencer. It is licensed +under the standard MIT terms. + +All other files which have no copyright comments are original works +produced specifically for use as part of this library, written either +by Rich Felker, the main author of the library, or by one or more +contibutors listed above. Details on authorship of individual files +can be found in the git version control history of the project. The +omission of copyright and license comments in each file is in the +interest of source tree size. + +All public header files (include/* and arch/*/bits/*) should be +treated as Public Domain as they intentionally contain no content +which can be covered by copyright. Some source modules may fall in +this category as well. If you believe that a file is so trivial that +it should be in the Public Domain, please contact the authors and +request an explicit statement releasing it from copyright. + +The following files are trivial, believed not to be copyrightable in +the first place, and hereby explicitly released to the Public Domain: + +All public headers: include/*, arch/*/bits/* +Startup files: crt/* diff --git a/pandas/_libs/src/headers/portable.h b/pandas/_libs/src/headers/portable.h index b9868276ef6e6..9ac4ebc306baa 100644 --- a/pandas/_libs/src/headers/portable.h +++ b/pandas/_libs/src/headers/portable.h @@ -5,4 +5,14 @@ #define strcasecmp( s1, s2 ) _stricmp( s1, s2 ) #endif +// GH-23516 - works around locale perf issues +// from MUSL libc, MIT Licensed - see LICENSES +// ASCII-only stand-ins for the ctype.h functions of the same name. +// They are macros and may evaluate `c` more than once, so never pass +// an argument with side effects (e.g. `*p++`). +#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u) +#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5u)) +#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26u) ? ((c) & 0x5f) : (c)) +#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26u) ? ((c) | 0x20) : (c)) + + #endif 
diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 4f9f825b15ffe..04f35d992dc6b 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -177,7 +177,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, num_decimals = 0; // Process string of digits - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -188,7 +188,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, *maybe_int = 0; p++; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -222,7 +222,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index e46e1e85f1c81..7eb0217a25cbd 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -23,6 +23,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include +#include "../headers/portable.h" // for isdigit_ascii + static void *safe_realloc(void *buffer, size_t size) { void *result; // OSX is weird. @@ -1411,7 +1413,7 @@ int tokenize_all_rows(parser_t *self) { } PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper(*p); + for (; *p; ++p) *p = toupper_ascii(*p); } int PANDAS_INLINE to_longlong(char *item, long long *p_value) { @@ -1424,7 +1426,7 @@ int PANDAS_INLINE to_longlong(char *item, long long *p_value) { *p_value = strtoll(item, &p_end, 10); // Allow trailing spaces. 
- while (isspace(*p_end)) ++p_end; + while (isspace_ascii(*p_end)) ++p_end; return (errno == 0) && (!*p_end); } @@ -1541,7 +1543,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, errno = 0; // Skip leading whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; // Handle optional sign. negative = 0; @@ -1558,7 +1560,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, num_decimals = 0; // Process string of digits. - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1570,7 +1572,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (*p == decimal) { p++; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1589,7 +1591,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string. - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { // Handle optional sign. negative = 0; switch (*++p) { @@ -1602,7 +1604,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits. num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1643,7 +1645,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, if (skip_trailing) { // Skip trailing whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; @@ -1697,7 +1699,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, errno = 0; // Skip leading whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; // Handle optional sign. negative = 0; @@ -1714,7 +1716,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, num_decimals = 0; // Process string of digits. 
- while (isdigit(*p)) { + while (isdigit_ascii(*p)) { if (num_digits < max_digits) { number = number * 10. + (*p - '0'); num_digits++; @@ -1730,7 +1732,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (*p == decimal) { p++; - while (num_digits < max_digits && isdigit(*p)) { + while (num_digits < max_digits && isdigit_ascii(*p)) { number = number * 10. + (*p - '0'); p++; num_digits++; @@ -1738,7 +1740,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } if (num_digits >= max_digits) // Consume extra decimal digits. - while (isdigit(*p)) ++p; + while (isdigit_ascii(*p)) ++p; exponent -= num_decimals; } @@ -1752,7 +1754,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string. - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { // Handle optional sign negative = 0; switch (*++p) { @@ -1765,7 +1767,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, // Process string of digits. num_digits = 0; n = 0; - while (isdigit(*p)) { + while (isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1798,7 +1800,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (skip_trailing) { // Skip trailing whitespace. - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p; @@ -1833,7 +1835,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int d; // Skip leading spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1846,7 +1848,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } // Check that there is a first digit. - if (!isdigit(*p)) { + if (!isdigit_ascii(*p)) { // Error... 
*error = ERROR_NO_DIGITS; return 0; @@ -1865,7 +1867,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number > pre_min) || @@ -1878,7 +1880,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { number = number * 10 - (d - '0'); @@ -1902,7 +1904,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number < pre_max) || @@ -1916,7 +1918,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); @@ -1931,7 +1933,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } // Skip trailing spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1954,7 +1956,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, int d; // Skip leading spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } @@ -1968,7 +1970,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } // Check that there is a first digit. - if (!isdigit(*p)) { + if (!isdigit_ascii(*p)) { // Error... 
*error = ERROR_NO_DIGITS; return 0; @@ -1984,7 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, if (d == tsep) { d = *++p; continue; - } else if (!isdigit(d)) { + } else if (!isdigit_ascii(d)) { break; } if ((number < pre_max) || @@ -1998,7 +2000,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } } } else { - while (isdigit(d)) { + while (isdigit_ascii(d)) { if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { number = number * 10 + (d - '0'); @@ -2012,7 +2014,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } // Skip trailing spaces. - while (isspace(*p)) { + while (isspace_ascii(*p)) { ++p; } From 7a5c391931453f0c03ec737e48fb1b7512d472fe Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Thu, 29 Nov 2018 09:49:25 -0600 Subject: [PATCH 3/5] whatsnew --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d4b2fefff322f..61fb84bf7e74a 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1182,6 +1182,7 @@ Performance Improvements - Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`) - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`) +- Fixed a performance regression in :func:`read_csv` on Windows with Python 3.7 (:issue:`23516`) .. 
_whatsnew_0240.docs: From f048ffffc79e916b6d47781307749079eb7f729a Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Mon, 3 Dec 2018 10:20:09 -0600 Subject: [PATCH 4/5] cpplint --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 7eb0217a25cbd..3a4058f37efc7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -23,7 +23,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include -#include "../headers/portable.h" // for isdigit_ascii +#include "../headers/portable.h" static void *safe_realloc(void *buffer, size_t size) { void *result; From 8af538ff05b4241cb4e4ace4459a840981fd9872 Mon Sep 17 00:00:00 2001 From: Chris Bartak Date: Mon, 3 Dec 2018 10:29:13 -0600 Subject: [PATCH 5/5] finish parse_helper --- pandas/_libs/src/parse_helper.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 04f35d992dc6b..b71131bee7008 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -138,11 +138,11 @@ int floatify(PyObject *str, double *result, int *maybe_int) { // PANDAS_INLINE void lowercase(char *p) { - for (; *p; ++p) *p = tolower(*p); + for (; *p; ++p) *p = tolower_ascii(*p); } PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper(*p); + for (; *p; ++p) *p = toupper_ascii(*p); } static double xstrtod(const char *str, char **endptr, char decimal, char sci, @@ -207,7 +207,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char sci, if (negative) number = -number; // Process an exponent string - if (toupper(*p) == toupper(sci)) { + if (toupper_ascii(*p) == toupper_ascii(sci)) { *maybe_int = 0; // Handle optional sign @@ -263,7 +263,7 @@ static double xstrtod(const char *str, char **endptr, char decimal, char 
sci, if (skip_trailing) { // Skip trailing whitespace - while (isspace(*p)) p++; + while (isspace_ascii(*p)) p++; } if (endptr) *endptr = p;