bxparks · bxparks · Feb 15, 2019 · Feb 14, 2019 · Feb 14, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+* Unreleased
+    * Add `--quoted_values_are_strings` flag to force quoted values (integers,
+      floats, booleans) to be interpreted as a `STRING`. (Thanks de-code@,
+      see #22).
 * 0.3.1 (2019-01-18)
     * Infer integers that overflow signed 64-bits to be `FLOAT` for
       consistency with `bq load`. (Fixes #18)

diff --git a/README.md b/README.md
@@ -164,6 +164,7 @@ The `generate_schema.py` script supports a handful of command line flags:
 
 * `--help` Prints the usage with the list of supported flags.
 * `--keep_nulls` Print the schema for null values, empty arrays or empty records.
+* `--quoted_values_are_strings` Quoted values should be interpreted as strings
 * `--debugging_interval lines` Number of lines between heartbeat debugging messages. Default 1000.
 * `--debugging_map` Print the metadata schema map for debugging purposes
 
@@ -183,6 +184,8 @@ optional arguments:
   -h, --help            show this help message and exit
   --keep_nulls          Print the schema for null values, empty arrays or
                         empty records.
+  --quoted_values_are_strings
+                        Quoted values should be interpreted as strings
   --debugging_interval DEBUGGING_INTERVAL
                         Number of lines between heartbeat debugging messages.
   --debugging_map       Print the metadata schema_map instead of the schema
@@ -236,6 +239,37 @@ INFO:root:Processed 1 lines
 ]
 ```
 
+#### Quoted Values Are Strings (`--quoted_values_are_strings`)
+
+By default, quoted values are inspected to determine if they can be interpreted
+as integers, floats or booleans, consistent with the algorithm of `bq load`.
+When this is not desired, this flag forces the `generate-schema` script to treat
+such values as `STRING` (quoted `DATE` and `TIME` values are still detected).
+
+```
+$ generate-schema
+{ "name": "1" }
+^D
+[
+  {
+    "mode": "NULLABLE",
+    "name": "name",
+    "type": "INTEGER"
+  }
+]
+
+$ generate-schema --quoted_values_are_strings
+{ "name": "1" }
+^D
+[
+  {
+    "mode": "NULLABLE",
+    "name": "name",
+    "type": "STRING"
+  }
+]
+```
+
 #### Debugging Interval (`--debugging_interval`)
 
 By default, the `generate_schema.py` script prints a short progress message
@@ -333,10 +367,14 @@ compatibility rules implemented by **bq load**:
     * we follow the same logic as **bq load** and always infer these as
       `TIMESTAMP`
 * `BOOLEAN`, `INTEGER`, and `FLOAT` can appear inside quoted strings
-  * In other words, `"true"` (or `"True"` or `"false"`, etc) is considered a
-    BOOLEAN type, `"1"` is considered an INTEGER type, and `"2.1"` is considered
-    a FLOAT type. Luigi Mori (jtschichold@) added additional logic to replicate
-    the type conversion logic used by `bq load` for these strings.
+    * In other words, `"true"` (or `"True"` or `"false"`, etc) is considered a
+      BOOLEAN type, `"1"` is considered an INTEGER type, and `"2.1"` is
+      considered a FLOAT type. Luigi Mori (jtschichold@) added additional logic
+      to replicate the type conversion logic used by `bq load` for these
+      strings.
+    * This type inference inside quoted strings can be disabled using the
+      `--quoted_values_are_strings` flag.
+    * (See [Issue #22](https://github.com/bxparks/bigquery-schema-generator/issues/22) for more details.)
 * `INTEGER` values overflowing a 64-bit signed integer upgrade to `FLOAT`
     * integers greater than `2^63-1` (9223372036854775807)
     * integers less than `-2^63` (-9223372036854775808)

diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py
@@ -70,10 +70,12 @@ class SchemaGenerator:
 
     def __init__(self,
                  keep_nulls=False,
+                 quoted_values_are_strings=False,
                  debugging_interval=1000,
                  debugging_map=False):
-        self.debugging_interval = debugging_interval
         self.keep_nulls = keep_nulls
+        self.quoted_values_are_strings = quoted_values_are_strings
+        self.debugging_interval = debugging_interval
         self.debugging_map = debugging_map
         self.line_number = 0
         self.error_logs = []
@@ -347,16 +349,21 @@ def infer_value_type(self, value):
                 return 'DATE'
             elif self.TIME_MATCHER.match(value):
                 return 'TIME'
-            elif self.INTEGER_MATCHER.match(value):
-                if int(value) < self.INTEGER_MIN_VALUE or \
-                    self.INTEGER_MAX_VALUE < int(value):
+            elif not self.quoted_values_are_strings:
+                # Implement the same type inference algorithm as 'bq load' for
+                # quoted values that look like ints, floats or bools.
+                if self.INTEGER_MATCHER.match(value):
+                    if int(value) < self.INTEGER_MIN_VALUE or \
+                        self.INTEGER_MAX_VALUE < int(value):
+                        return 'QFLOAT'  # quoted float
+                    else:
+                        return 'QINTEGER'  # quoted integer
+                elif self.FLOAT_MATCHER.match(value):
                     return 'QFLOAT'  # quoted float
+                elif value.lower() in ['true', 'false']:
+                    return 'QBOOLEAN'  # quoted boolean
                 else:
-                    return 'QINTEGER'  # quoted integer
-            elif self.FLOAT_MATCHER.match(value):
-                return 'QFLOAT'  # quoted float
-            elif value.lower() in ['true', 'false']:
-                return 'QBOOLEAN'  # quoted boolean
+                    return 'STRING'
             else:
                 return 'STRING'
         # Python 'bool' is a subclass of 'int' so we must check it first
@@ -593,6 +600,10 @@ def main():
         '--keep_nulls',
         help='Print the schema for null values, empty arrays or empty records.',
         action="store_true")
+    parser.add_argument(
+        '--quoted_values_are_strings',
+        help='Quoted values should be interpreted as strings',
+        action="store_true")
     parser.add_argument(
         '--debugging_interval',
         help='Number of lines between heartbeat debugging messages.',
@@ -608,8 +619,11 @@ def main():
     # Configure logging.
     logging.basicConfig(level=logging.INFO)
 
-    generator = SchemaGenerator(args.keep_nulls, args.debugging_interval,
-                                args.debugging_map)
+    generator = SchemaGenerator(
+        keep_nulls=args.keep_nulls,
+        quoted_values_are_strings=args.quoted_values_are_strings,
+        debugging_interval=args.debugging_interval,
+        debugging_map=args.debugging_map)
     generator.run()
 
 

diff --git a/tests/test_generate_schema.py b/tests/test_generate_schema.py
@@ -162,6 +162,19 @@ def test_infer_value_type(self):
         self.assertEqual('__empty_array__', generator.infer_value_type([]))
         self.assertEqual('__array__', generator.infer_value_type([1, 2, 3]))
 
+    def test_quoted_values_are_strings(self):
+        generator = SchemaGenerator(quoted_values_are_strings=True)
+        self.assertEqual('STRING', generator.infer_value_type('abcd'))
+
+        self.assertEqual('INTEGER', generator.infer_value_type(1))
+        self.assertEqual('STRING', generator.infer_value_type('1'))
+
+        self.assertEqual('FLOAT', generator.infer_value_type(1.0))
+        self.assertEqual('STRING', generator.infer_value_type('1.0'))
+
+        self.assertEqual('BOOLEAN', generator.infer_value_type(True))
+        self.assertEqual('STRING', generator.infer_value_type('True'))
+
     def test_infer_bigquery_type(self):
         generator = SchemaGenerator()
 
@@ -442,6 +455,7 @@ def test(self):
     def verify_data_chunk(self, chunk_count, chunk):
         data_flags = chunk['data_flags']
         keep_nulls = ('keep_nulls' in data_flags)
+        quoted_values_are_strings = ('quoted_values_are_strings' in data_flags)
         records = chunk['records']
         expected_errors = chunk['errors']
         expected_error_map = chunk['error_map']
@@ -450,7 +464,8 @@ def verify_data_chunk(self, chunk_count, chunk):
         print("Test chunk %s: First record: %s" % (chunk_count, records[0]))
 
         # Generate schema.
-        generator = SchemaGenerator(keep_nulls)
+        generator = SchemaGenerator(keep_nulls=keep_nulls,
+            quoted_values_are_strings=quoted_values_are_strings)
         schema_map, error_logs = generator.deduce_schema(records)
         schema = generator.flatten_schema(schema_map)
 

diff --git a/tests/testdata.txt b/tests/testdata.txt
@@ -707,3 +707,29 @@ SCHEMA
   }
 ]
 END
+
+# Quoted values are forced to be strings if the --quoted_values_are_strings
+# flag is given
+DATA quoted_values_are_strings
+{ "qi" : "1", "qf": "1.0", "qb": "true" }
+{ "qi" : "2", "qf": "1.1", "qb": "True" }
+{ "qi" : "3", "qf": "2.0", "qb": "false" }
+SCHEMA
+[
+  {
+    "mode": "NULLABLE",
+    "name": "qi",
+    "type": "STRING"
+  },
+  {
+    "mode": "NULLABLE",
+    "name": "qf",
+    "type": "STRING"
+  },
+  {
+    "mode": "NULLABLE",
+    "name": "qb",
+    "type": "STRING"
+  }
+]
+END