diff --git a/easyaccess/easyaccess.py b/easyaccess/easyaccess.py
index 68be11a..ac622dd 100755
--- a/easyaccess/easyaccess.py
+++ b/easyaccess/easyaccess.py
@@ -56,6 +56,10 @@ def colored(line, color): return line
 import webbrowser
 import signal
 
+class KeyParser(argparse.ArgumentParser):
+    def error(self, message):
+        sys.exit(2)
+
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s',
                     datefmt='%m/%d/%Y %I:%M:%S %p')
 
@@ -1799,7 +1803,7 @@ def create_table(self, table, columns, dtypes):
         self.cur.execute(qtable)
         if self.autocommit: self.con.commit()
 
-    def insert_data(self, table, columns, values, dtypes=None):
+    def insert_data(self, table, columns, values, dtypes=None, niter=0):
         """Insert data into a DB table.
 
         Trim trailing whitespace from string columns. Because of the
@@ -1845,15 +1849,15 @@ def insert_data(self, table, columns, values, dtypes=None):
             raise cx_Oracle.DatabaseError(msg)
 
         print(colored(
-            '\n Inserted %d rows and %d columns into table %s in %.2f seconds' % (
-                len(values), len(columns), table.upper(), t2 - t1), "green"))
+            '\n [Iter: %d] Inserted %d rows and %d columns into table %s in %.2f seconds' % (
+                niter + 1, len(values), len(columns), table.upper(), t2 - t1), "green"))
 
     def do_load_table(self, line, name=''):
         """
         DB:Loads a table from a file (csv or fits) taking name from filename and
         columns from header
 
-        Usage: load_table
+        Usage: load_table <filename> [--tablename NAME] [--chunksize CHUNK]
         Ex: example.csv has the following content
         RA,DEC,MAG
         1.23,0.13,23
@@ -1861,15 +1865,43 @@ def do_load_table(self, line, name=''):
         This command will create a table named EXAMPLE with 3 columns RA,DEC and MAG
         and values taken from file
 
+        Optional Arguments:
+
+            --tablename NAME    name given to the table; default is taken from the filename
+            --chunksize CHUNK   number of rows to be inserted at a time, useful for large files
+                                that do not fit in memory
+
         Note: - For csv or tab files, first line must have the column names (without # or any other comment)
         and same format as data (using ',' or space)
         - For fits file header must have columns names and data types
         - For filenames use .csv or .fits do not use extra points
         """
-        filename = self.get_filename(line)
+        line = line.replace(';','')
+        load_parser = KeyParser(prog='', usage='', add_help=False)
+        load_parser.add_argument('filename', help='name for the file', action='store', default=None)
+        load_parser.add_argument('--tablename', help='name for the table', action='store', default='')
+        load_parser.add_argument('--chunksize', help='number of rows to read in blocks to avoid memory '
+                                                     'issues', action='store', type=int, default=None)
+        load_parser.add_argument('-h', '--help', help='print help', action='store_true')
+        try:
+            load_args = load_parser.parse_args(line.split())
+        except SystemExit:
+            self.do_help('load_table')
+            return
+        if load_args.help:
+            self.do_help('load_table')
+            return
+        filename = self.get_filename(load_args.filename)
+        name = load_args.tablename
+        chunk = load_args.chunksize
         if filename is None: return
         base, ext = os.path.splitext(os.path.basename(filename))
+        if ext == '.h5' and chunk is not None:
+            print(colored("\nHDF5 file upload with chunksize is not supported yet. Try without "
+                          "--chunksize\n","red"))
+            return
+
         if name == '':
             table = base
         else:
@@ -1882,34 +1914,85 @@ def do_load_table(self, line, name=''):
             return
 
         try:
-            data = self.load_data(filename)
+            data, iterator = self.load_data(filename)
         except:
             print_exception()
             return
 
         # Get the data in a way that Oracle understands
-        columns = data.ea_get_columns()
-        values = data.ea_get_values()
-        dtypes = data.ea_get_dtypes()
-        # Clean up the original object
-        del data
+        iteration = 0
+        done = False
+        total_rows = 0
+        if data.file_type == 'pandas':
+            while not done:
+                try:
+                    if iterator:
+                        df = data.get_chunk(chunk)
+                    else:
+                        df = data
+                    df.file_type = 'pandas'
+                    if len(df) == 0: break
+                    if iteration == 0:
+                        dtypes = eafile.get_dtypes(df)
+                    columns = df.columns.values.tolist()
+                    values = df.values.tolist()
+                    total_rows += len(df)
+                except:
+                    break
+                if iteration == 0:
+                    try:
+                        self.create_table(table, columns, dtypes)
+                    except:
+                        print_exception()
+                        self.drop_table(table)
+                        return
+                try:
+                    if not done:
+                        self.insert_data(table, columns, values, dtypes, iteration)
+                        iteration += 1
+                        if not iterator: done = True
+                except:
+                    print_exception()
+                    self.drop_table(table)
+                    return
 
-        try:
-            self.create_table(table, columns, dtypes)
-        except:
-            print_exception()
-            self.drop_table(table)
-            return
+        if data.file_type == 'fits':
+            if chunk is None: chunk = data[1].get_nrows()
+            start = 0
+            while not done:
+                try:
+                    df = data
+                    if iteration == 0:
+                        dtypes = eafile.get_dtypes(df)
+                    columns = df[1].get_colnames()
+                    values = df[1][start:start + chunk].tolist()
+                    start += chunk
+                    if len(values) == 0: break
+                    total_rows += len(values)
+                except:
+                    break
+                if iteration == 0:
+                    try:
+                        self.create_table(table, columns, dtypes)
+                    except:
+                        print_exception()
+                        self.drop_table(table)
+                        return
 
-        try:
-            self.insert_data(table, columns, values, dtypes)
-        except:
-            print_exception()
-            self.drop_table(table)
-            return
+                try:
+                    if not done:
+                        self.insert_data(table, columns, values, dtypes, iteration)
+                        iteration += 1
+                except:
+                    print_exception()
+                    self.drop_table(table)
+                    return
 
-        print(colored('\n Table %s loaded successfully.\n' % table.upper(), "green"))
+
+
+        print(colored('\n ** Table %s loaded successfully with %d rows.\n' % (table.upper(), total_rows),
+                      "green"))
         print(colored(' You may want to refresh the metadata so your new table appears during\n autocompletion',"cyan"))
         print(colored(' DESDB ~> refresh_metadata_cache;',"cyan"))
 
@@ -1927,24 +2010,53 @@ def do_append_table(self, line, name=''):
         """
         DB:Appends a table from a file (csv or fits) taking name from filename and
         columns from header.
 
-        Usage: append_table
+        Usage: append_table <filename> [--tablename NAME] [--chunksize CHUNK]
         Ex: example.csv has the following content
         RA,DEC,MAG
         1.23,0.13,23
         0.13,0.01,22
 
         This command will append the contents of example.csv to the table named EXAMPLE.
+        It is meant to be used after the load_table command
+
+        Optional Arguments:
+
+            --tablename NAME    name given to the table; default is taken from the filename
+            --chunksize CHUNK   number of rows to be inserted at a time, useful for large files
+                                that do not fit in memory
 
         Note: - For csv or tab files, first line must have the column names (without # or any other comment)
         and same format as data (using ',' or space)
         - For fits file header must have columns names and data types
         - For filenames use .csv or .fits do not use extra points
         """
-
-        filename = self.get_filename(line)
+        line = line.replace(';','')
+        append_parser = KeyParser(prog='', usage='', add_help=False)
+        append_parser.add_argument('filename', help='name for the file', action='store', default=None)
+        append_parser.add_argument('--tablename', help='name for the table to append to', action='store', default='')
+        append_parser.add_argument('--chunksize', help='number of rows to read in blocks to avoid memory '
+                                                       'issues', action='store', default=None, type=int)
+        append_parser.add_argument('-h', '--help', help='print help', action='store_true')
+        try:
+            append_args = append_parser.parse_args(line.split())
+        except SystemExit:
+            self.do_help('append_table')
+            return
+        if append_args.help:
+            self.do_help('append_table')
+            return
+        filename = self.get_filename(append_args.filename)
+        name = append_args.tablename
+        chunk = append_args.chunksize
         if filename is None: return
         base, ext = os.path.splitext(os.path.basename(filename))
+        if ext == '.h5' and chunk is not None:
+            print(colored("\nHDF5 file upload with chunksize is not supported yet. Try without "
+                          "--chunksize\n","red"))
+            return
+
+
         if name == '':
             table = base
         else:
@@ -1956,23 +2068,65 @@ def do_append_table(self, line, name=''):
                 '\n DESDB ~> CREATE TABLE %s (COL1 TYPE1(SIZE), ..., COLN TYPEN(SIZE));\n' % table.upper())
             return
         try:
-            data = self.load_data(filename)
+            data, iterator = self.load_data(filename)
         except:
             print_exception()
             return
 
-        columns = data.ea_get_columns()
-        values = data.ea_get_values()
-        dtypes = data.ea_get_dtypes()
-        del data
-        try:
-            self.insert_data(table, columns, values, dtypes)
-        except:
-            print_exception()
-            return
+        iteration = 0
+        done = False
+        total_rows = 0
+        if data.file_type == 'pandas':
+            while not done:
+                try:
+                    if iterator:
+                        df = data.get_chunk(chunk)
+                    else:
+                        df = data
+                    df.file_type = 'pandas'
+                    if len(df) == 0: break
+                    if iteration == 0:
+                        dtypes = eafile.get_dtypes(df)
+                    columns = df.columns.values.tolist()
+                    values = df.values.tolist()
+                    total_rows += len(df)
+                except:
+                    break
+                try:
+                    if not done:
+                        self.insert_data(table, columns, values, dtypes, iteration)
+                        iteration += 1
+                        if not iterator: done = True
+                except:
+                    print_exception()
+                    return
+
+        if data.file_type == 'fits':
+            if chunk is None: chunk = data[1].get_nrows()
+            start = 0
+            while not done:
+                try:
+                    df = data
+                    if iteration == 0:
+                        dtypes = eafile.get_dtypes(df)
+                    columns = df[1].get_colnames()
+                    values = df[1][start:start + chunk].tolist()
+                    start += chunk
+                    if len(values) == 0: break
+                    total_rows += len(values)
+                except:
+                    break
+                try:
+                    if not done:
+                        self.insert_data(table, columns, values, dtypes, iteration)
+                        iteration += 1
+                except:
+                    print_exception()
+                    return
 
-        print(colored('\n Table %s appended successfully.' % table.upper(), "green"))
+        print(colored('\n ** Table %s appended successfully with %d rows.' % (table.upper(), total_rows),
+                      "green"))
 
 
     def complete_append_table(self, text, line, start_idx, end_idx):
 
@@ -2254,10 +2408,18 @@ def initial_message(quiet=False, clear=True):
                         help="Loads a sql command, execute it and exit")
     parser.add_argument("-lt", "--load_table", dest='loadtable',
                         help="Loads data from a csv, tab, or fits formatted file \
-                        into a DB table using the filename as the table name")
+                        into a DB table using the filename as the table name or a custom \
+                        name with --tablename MYTABLE")
     parser.add_argument("-at", "--append_table", dest='appendtable',
                         help="Appends data from a csv, tab, or fits formatted file \
-                        into a DB table using the filename as the table name")
+                        into a DB table using the filename as the table name or a custom \
+                        name with --tablename MYTABLE")
+    parser.add_argument("--tablename", dest='tablename',
+                        help="Custom table name to be used with --load_table \
+                        or --append_table")
+    parser.add_argument("--chunksize", dest='chunksize', type=int, default=None,
+                        help="Number of rows to be inserted at a time. Useful for large files \
+                        that do not fit in memory. Use with --load_table or --append_table")
     parser.add_argument("-s", "--db",dest='db', #choices=[...]?
                         help="Override database name [dessci,desoper,destest]")
     parser.add_argument("-q", "--quiet", action="store_true", dest='quiet',
@@ -2377,12 +2539,20 @@ def colored(line, color): return line
         initial_message(args.quiet, clear=False)
         cmdinterp = easy_or(conf, desconf, db, interactive=False, quiet=args.quiet)
         linein = "load_table " + args.loadtable
+        if args.tablename is not None:
+            linein += ' --tablename ' + args.tablename
+        if args.chunksize is not None:
+            linein += ' --chunksize ' + str(args.chunksize)
         cmdinterp.onecmd(linein)
         os._exit(0)
     elif args.appendtable is not None:
         initial_message(args.quiet, clear=False)
         cmdinterp = easy_or(conf, desconf, db, interactive=False, quiet=args.quiet)
         linein = "append_table " + args.appendtable
+        if args.tablename is not None:
+            linein += ' --tablename ' + args.tablename
+        if args.chunksize is not None:
+            linein += ' --chunksize ' + str(args.chunksize)
         cmdinterp.onecmd(linein)
         os._exit(0)
     else:
diff --git a/easyaccess/eautils/fileio.py b/easyaccess/eautils/fileio.py
index 38436ec..2747ded 100644
--- a/easyaccess/eautils/fileio.py
+++ b/easyaccess/eautils/fileio.py
@@ -274,6 +274,21 @@ def read_file(filename):
 
     return data
 
+def get_dtypes(df):
+    """
+    Get dtypes from a pandas DataFrame or a fitsio.FITS object
+    """
+    if df.file_type == 'pandas':
+        dtypes = [df[c].dtype if df[c].dtype.kind != 'O'
+                  else np.dtype('S' + str(max(df[c].str.len())))
+                  for i, c in enumerate(df)]
+
+    if df.file_type == 'fits':
+        dtype = df[1].get_rec_dtype(vstorage='fixed')[0]
+        dtypes = [dtype[i] for i, d in enumerate(dtype.descr)]
+    return dtypes
+
+
 def read_pandas(filename):
     """
     Read an input file into a pandas DataFrame. Accepted file
@@ -295,24 +310,27 @@ def read_pandas(filename):
         if ext in ('.csv', '.tab'):
             if ext == '.csv': sepa = ','
             if ext == '.tab': sepa = None
-            df = pd.read_csv(filename, sep=sepa)
+            df = pd.read_csv(filename, sep=sepa, iterator=True)
+            iterator = True
         elif ext in ('.h5'):
-            df = pd.read_hdf(filename, key='data')
+            df = pd.read_hdf(filename, key='data')  # iterator for hdf in pandas 0.18
+            iterator = False
     except:
         msg = 'Problem reading %s\n' % filename
         raise IOError(msg)
 
     # Monkey patch to grab columns and values
     # List comprehension is faster but less readable
-    dtypes = [df[c].dtype if df[c].dtype.kind != 'O'
-              else np.dtype('S' + str(max(df[c].str.len())))
-              for i, c in enumerate(df)]
+    #dtypes = [df[c].dtype if df[c].dtype.kind != 'O'
+    #          else np.dtype('S' + str(max(df[c].str.len())))
+    #          for i, c in enumerate(df)]
 
-    df.ea_get_columns = df.columns.values.tolist
-    df.ea_get_values = df.values.tolist
-    df.ea_get_dtypes = lambda: dtypes
+    #df.ea_get_columns = df.columns.values.tolist
+    #df.ea_get_values = df.values.tolist
+    #df.ea_get_dtypes = lambda: dtypes
+    df.file_type = 'pandas'
 
-    return df
+    return df, iterator
 
 
 def read_fitsio(filename):
@@ -334,12 +352,13 @@ def read_fitsio(filename):
         msg = 'Problem reading %s\n' % filename
         raise IOError(msg)
     # Monkey patch to grab columns and values
-    dtype = fits[1].get_rec_dtype(vstorage='fixed')[0]
-    dtypes = [dtype[i] for i, d in enumerate(dtype.descr)]
+    #dtype = fits[1].get_rec_dtype(vstorage='fixed')[0]
+    #dtypes = [dtype[i] for i, d in enumerate(dtype.descr)]
 
-    fits.ea_get_columns = fits[1].get_colnames
-    fits.ea_get_values = fits[1].read().tolist
-    fits.ea_get_dtypes = lambda: dtypes
+    #fits.ea_get_columns = fits[1].get_colnames
+    #fits.ea_get_values = fits[1].read().tolist
+    #fits.ea_get_dtypes = lambda: dtypes
+    fits.file_type = 'fits'
 
     # ## # Hack to just get a subset of columns
     ### x1,x2 = 25,37
@@ -349,7 +368,7 @@ def read_fitsio(filename):
     ### fits.ea_get_values = lambda: fits[1].read(columns=fits.ea_get_columns()).tolist()
     ### fits.ea_get_dtypes = lambda: dtypes[x1:x2]
 
-    return fits
+    return fits, True
 
 
 if __name__ == "__main__":
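
Side note on the chunked-read pattern the patch relies on: a minimal sketch, assuming pandas is installed. The file name and chunk size below are illustrative only and are not part of the patch.

import pandas as pd

# read_csv(..., iterator=True) returns a TextFileReader; get_chunk(n) yields
# successive DataFrames of at most n rows, so the whole file is never in memory.
reader = pd.read_csv('example.csv', iterator=True)  # illustrative file name
chunksize = 100000                                  # illustrative chunk size
total_rows = 0
while True:
    try:
        chunk = reader.get_chunk(chunksize)
    except StopIteration:
        break
    if len(chunk) == 0:
        break
    total_rows += len(chunk)
    # the patch passes chunk.values.tolist() to insert_data() at this point
print('read %d rows' % total_rows)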