Change config & CSV format #29, #30

- Add min_values/max_values in place of valid_range
- CSV file has header
- Use Pandas to parse CSV (so add as requirement)
- Update examples
- Bump version
- Check for implementation of YATSM algorithm
- Put YATSM algo class in config
ceholden · Aug 25, 2015 · f38306e · f38306e
1 parent 498e8ed
commit f38306e
Show file tree

Hide file tree

Showing 10 changed files with 66 additions and 67 deletions.
diff --git a/examples/p022r049_example.yaml b/examples/p022r049_example.yaml
@@ -24,7 +24,8 @@ dataset:
     mask_values: [2, 3, 4, 255]
     # Valid range of non-mask band data
     # specify 1 range for all bands, or specify ranges for each band
-    valid_range: [0, 10000]
+    min_values: 0
+    max_values: 10000
     # Indices for multi-temporal cloud masking (indexed on 1)
     green_band: 2
     swir1_band: 5
@@ -62,7 +63,7 @@ LassoCV:
 
 # Section for phenology fitting
 phenology:
-    calc_pheno: False
+    enable: False
     # Specification for dataset indices required for EVI based phenology monitoring
     red_index: 2
     nir_index: 3

diff --git a/examples/p022r049_input.csv b/examples/p022r049_input.csv
@@ -1,3 +1,4 @@
+date,sensor,filename
 1993009,LT5,/home/ceholden/Documents/landsat_stack/p022r049/images/LT50220491993009XXX04/LT50220491993009XXX04_stack
 1993025,LT5,/home/ceholden/Documents/landsat_stack/p022r049/images/LT50220491993025AAA04/LT50220491993025AAA04_stack
 1993041,LT5,/home/ceholden/Documents/landsat_stack/p022r049/images/LT50220491993041AAA04/LT50220491993041AAA04_stack

diff --git a/examples/p035r032_example.yaml b/examples/p035r032_example.yaml
@@ -23,7 +23,8 @@ dataset:
     mask_values: [2, 3, 4, 255]
     # Valid range of non-mask band data
     # specify 1 range for all bands, or specify ranges for each band
-    valid_range: [0, 10000]
+    min_values: 0
+    max_values: 10000
     # Indices for multi-temporal cloud masking (indexed on 1)
     green_band: 2
     swir1_band: 5
@@ -67,7 +68,7 @@ OLS:
 
 # Section for phenology fitting
 phenology:
-    calc_pheno: False
+    enable: False
     # Specification for dataset indices required for EVI based phenology monitoring
     red_index: 2
     nir_index: 3

diff --git a/examples/p035r032_input.csv b/examples/p035r032_input.csv
@@ -1,3 +1,4 @@
+date,sensor,filename
 1984108,LT5,/home/ceholden/Documents/landsat_stack/p035r032/images/LT50350321984108XXX01/LT50350321984108XXX01_stack.gtif
 1984140,LT5,/home/ceholden/Documents/landsat_stack/p035r032/images/LT50350321984140PAC00/LT50350321984140PAC00_stack.gtif
 1984156,LT5,/home/ceholden/Documents/landsat_stack/p035r032/images/LT50350321984156PAC00/LT50350321984156PAC00_stack.gtif

diff --git a/requirements.txt b/requirements.txt
@@ -8,5 +8,6 @@ matplotlib>=1.4.2
 click>=4.0
 click_plugins>=1.0
 palettable>=2.0.0
+pandas>=0.16.0
 patsy>=0.3.0
 pyyaml>=3.11
diff --git a/scripts/gen_date_file.sh b/scripts/gen_date_file.sh
@@ -31,6 +31,8 @@ EOF
 }
 
 function main() {
+    # Header
+    echo "date,sensor,filename" > $output
 
     images=$(find $root -follow -name "$pattern")
     nimages=$(echo $images | awk '{ print NF }')
@@ -56,7 +58,7 @@ function main() {
         sensor=${id:$sstart:3}
 
         echo "$ydoy,$sensor,$name"
-    done | sort > $output
+    done | sort >> $output
 
 }
 

diff --git a/yatsm/algorithms/__init__.py b/yatsm/algorithms/__init__.py
@@ -4,3 +4,6 @@
     - ccdc.CCDCesque
 
 """
+from .ccdc import CCDCesque
+
+available = ['CCDCesque']
diff --git a/yatsm/config_parser.py b/yatsm/config_parser.py
@@ -1,12 +1,14 @@
+import inspect
 import StringIO
-import yaml
 
 import numpy as np
 import sklearn.linear_model
 import sklearn.externals.joblib
+import yaml
 
-from log_yatsm import logger
-from version import __version__
+from . import algorithms
+from .log_yatsm import logger
+from .version import __version__
 
 
 def parse_config_file(config_file):
@@ -29,24 +31,42 @@ def parse_config_file(config_file):
         cfg = yaml.safe_load(f)
 
     # Ensure algorithm & prediction sections are specified
-    if 'YATSM' not in cfg.keys():
+    if 'YATSM' not in cfg:
         raise KeyError('YATSM must be a section in configuration YAML file')
 
     if 'algorithm' not in cfg['YATSM']:
         raise KeyError('YATSM section does not declare an algorithm')
-    if cfg['YATSM']['algorithm'] not in cfg.keys():
+    algo = cfg['YATSM']['algorithm']
+    if algo not in cfg:
         raise KeyError('Algorithm specified (%s) is not parameterized in '
-                       'configuration file' % cfg['YATSM']['algorithm'])
+                       'configuration file' % algo)
 
-    if 'prediction' not in cfg['YATSM'].keys():
+    if 'prediction' not in cfg['YATSM']:
         raise KeyError('YATSM section does not declare a prediction method')
-    if cfg['YATSM']['prediction'] not in cfg.keys():
+    if cfg['YATSM']['prediction'] not in cfg:
         raise KeyError('Prediction method specified (%s) is not parameterized '
                        'in configuration file' % cfg['YATSM']['prediction'])
 
+    # Embed algorithm in YATSM key
+    if algo not in algorithms.available:
+        raise NotImplementedError('Algorithm specified (%s) is not currently '
+                                  'available' % algo)
+    cfg['YATSM']['algorithm_cls'] = getattr(algorithms, algo)
+    if not cfg['YATSM']['algorithm_cls']:
+        raise KeyError('Could not find algorithm specified (%s) in '
+                       '`yatsm.algorithms`' % algo)
+
+    # Expand min/max values to all bands
+    n_bands = cfg['dataset']['n_bands']
+    mins, maxes = cfg['dataset']['min_values'], cfg['dataset']['max_values']
+    if isinstance(mins, (float, int)):
+        cfg['dataset']['min_values'] = np.asarray([mins] * n_bands)
+    if isinstance(maxes, (float, int)):
+        cfg['dataset']['max_values'] = np.asarray([maxes] * n_bands)
+
     # Add in dummy phenology and classification dicts if not included
     if 'phenology' not in cfg:
-        cfg['phenology'] = {'calc_pheno': False}
+        cfg['phenology'] = {'enable': False}
 
     if 'classification' not in cfg:
         cfg['classification'] = {'training_image': None}

diff --git a/yatsm/utils.py b/yatsm/utils.py
@@ -1,12 +1,12 @@
 from __future__ import division
 
-import csv
 from datetime import datetime as dt
 import fnmatch
 import os
 import sys
 
 import numpy as np
+import pandas as pd
 
 from log_yatsm import logger
 
@@ -46,8 +46,10 @@ def distribute_jobs(job_number, total_jobs, n, interlaced=True):
         tasks = np.arange(i_start, min(i_end, n))
 
     if tasks.size == 0:
-        raise ValueError('No jobs assigned for job_number/total_jobs: {j}/{t}'.
-                         format(j=job_number, t=total_jobs))
+        raise ValueError(
+            'No jobs assigned for job_number/total_jobs: {j}/{t}'.format(
+                j=job_number,
+                t=total_jobs))
 
     return tasks
 
@@ -64,67 +66,36 @@ def get_output_name(dataset_config, line):
 
     """
     return os.path.join(dataset_config['output'],
-                        '{pref}{line}.npz'.format(
-                            pref=dataset_config['output_prefix'],
-                            line=line))
+                        '%s%s.npz' % (dataset_config['output_prefix'], line))
 
 
 # IMAGE DATASET READING
-def csvfile_to_dataset(input_file, date_format='%Y-%j'):
+def csvfile_to_dataset(input_file, date_format='%Y%j'):
     """ Return sorted filenames of images from input text file
 
     Args:
       input_file (str): text file of dates and files
       date_format (str): format of dates in file
 
     Returns:
-      dict: dates, sensor IDs, and filenames of stacked images as np.ndarray
-        within a dict
+      pd.DataFrame: dates, sensor IDs, and filenames
 
     """
-    # Store index of date and image
-    i_date = 0
-    i_sensor = 1
-    i_image = 2
+    df = pd.read_csv(input_file)
 
-    dates = []
-    images = []
-    sensors = []
+    # Guess and convert date field
+    date_col = [i for i, n in enumerate(df.columns) if 'date' in n.lower()]
+    if not date_col:
+        raise KeyError('Could not find date column in input file')
+    if len(date_col) > 1:
+        logger.warning('Multiple date columns found in input CSV file. '
+                       'Using %s' % df.columns[date_col[0]])
+    date_col = df.columns[date_col[0]]
 
-    logger.debug('Opening image dataset file')
-    with open(input_file, 'rb') as f:
-        reader = csv.reader(f)
+    df[date_col] = pd.to_datetime(
+        df[date_col], format=date_format).map(lambda x: dt.toordinal(x))
 
-        # Figure out which index is for what
-        row = reader.next()
-
-        try:
-            dt.strptime(row[i_date], date_format).toordinal()
-        except:
-            logger.debug('Could not parse first column to ordinal date')
-            try:
-                dt.strptime(row[i_sensor], date_format).toordinal()
-            except:
-                logger.debug('Could not parse second column to ordinal date')
-                logger.error('Could not parse any columns to ordinal date')
-                logger.error('Input dataset file: {f}'.format(f=input_file))
-                logger.error('Date format: {f}'.format(f=date_format))
-                raise
-            else:
-                i_date = 1
-                i_sensor = 0
-
-        f.seek(0)
-
-        logger.debug('Reading in image date, sensor, and filenames')
-        for row in reader:
-            dates.append(dt.strptime(row[i_date], date_format).toordinal())
-            sensors.append(row[i_sensor])
-            images.append(row[i_image])
-
-        return {'dates': np.array(dates),
-                'sensors': np.array(sensors),
-                'images': np.array(images)}
+    return df
 
 
 def get_image_IDs(filenames):
@@ -183,9 +154,7 @@ def write_output(raster, output, image_ds, gdal_frmt, ndv, band_names=None):
 
         if band_names is not None:
             ds.GetRasterBand(1).SetDescription(band_names[0])
-            ds.GetRasterBand(1).SetMetadata({
-                'band_1': band_names[0]
-            })
+            ds.GetRasterBand(1).SetMetadata({'band_1': band_names[0]})
 
     ds.SetProjection(image_ds.GetProjection())
     ds.SetGeoTransform(image_ds.GetGeoTransform())

diff --git a/yatsm/version.py b/yatsm/version.py
@@ -1 +1 @@
-__version__ = '0.4.0'
+__version__ = '0.5.0b'