Skip to content

Commit

Permalink
Update with training info
Browse files Browse the repository at this point in the history
  • Loading branch information
Erotemic committed Oct 15, 2023
1 parent fae2795 commit 15c01ad
Show file tree
Hide file tree
Showing 7 changed files with 626 additions and 163 deletions.
193 changes: 117 additions & 76 deletions README.rst

Large diffs are not rendered by default.

90 changes: 90 additions & 0 deletions make_splits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
def make_splits():
    """
    Build train/validation kwcoco splits for the shitspotter dataset.

    Only images with at least one annotation are considered. Images are
    grouped into one "video" per capture year (a hack so video-oriented
    tooling can consume the dataset), then the earliest year becomes the
    validation split and all later years become the train split. Both
    splits are written next to the source kwcoco bundle, and stable
    ``train.kwcoco.zip`` / ``vali.kwcoco.zip`` symlinks are refreshed.

    Side effects:
        Writes two ``.kwcoco.zip`` files and two symlinks into the
        dataset bundle directory.
    """
    import shitspotter
    import kwcoco
    coco_fpath = shitspotter.util.find_shit_coco_fpath()
    dset = kwcoco.CocoDataset(coco_fpath)

    # Restrict the splits to images that actually have annotations.
    gids_with_annots = [gid for gid, aids in dset.index.gid_to_aids.items() if len(aids) > 0]
    images_with_annots = dset.images(gids_with_annots)

    import ubelt as ub
    from kwutil import util_time
    datetimes = list(map(util_time.coerce_datetime, images_with_annots.lookup('datetime', None)))
    year_to_gids = ub.group_items(images_with_annots, [d.year for d in datetimes])

    # Group images into per-year videos (idempotent: reuse existing videos).
    for year, gids in year_to_gids.items():

        video_name = f'video_{year}'
        if video_name not in dset.index.name_to_video:
            video_id = dset.add_video(name=video_name)
        else:
            video_id = dset.index.name_to_video[video_name]['id']

        video = dset.index.videos[video_id]

        video_images = dset.images(gids)

        for idx, img in enumerate(video_images.objs):
            img['frame_index'] = idx
            img['video_id'] = video_id
            img['sensor_coarse'] = 'phone'
            img['datetime_captured'] = img['datetime']
            img['channels'] = 'red|green|blue'

        # hack: assume every frame in the video shares the dimensions of
        # the last image seen in the loop above.
        video['width'] = img['width']
        video['height'] = img['height']

    dset._build_index()
    dset.conform()

    # The earliest year is validation; every later year is train.
    # (A previous revision also built year<=2020 splits here, but that
    # result was dead code, immediately overwritten by this grouping.)
    groups = [g for k, g in sorted(year_to_gids.items())]
    train_gids = list(ub.flatten(groups[1:]))
    vali_gids = list(ub.flatten(groups[:1]))

    train_split = dset.subset(train_gids)
    vali_split = dset.subset(vali_gids)

    def build_code(coco_dset):
        # Short deterministic tag: image count plus a hash prefix, so
        # filenames change whenever the split contents change.
        hashid = coco_dset._build_hashid()[0:8]
        return f'imgs{coco_dset.n_images}_{hashid}'

    bundle_dpath = ub.Path(dset.fpath).parent

    fname = ('vali_' + build_code(vali_split) + '.kwcoco.zip')
    vali_split.fpath = bundle_dpath / fname

    fname = ('train_' + build_code(train_split) + '.kwcoco.zip')
    train_split.fpath = bundle_dpath / fname

    print(f'vali_split.fpath={vali_split.fpath}')
    print(f'train_split.fpath={train_split.fpath}')

    train_split.conform()
    vali_split.conform()

    vali_split.dump()
    train_split.dump()

    # Refresh stable names so downstream configs do not need the hashid.
    ub.symlink(train_split.fpath, link_path=train_split.fpath.parent / 'train.kwcoco.zip', overwrite=True, verbose=3)
    ub.symlink(vali_split.fpath, link_path=vali_split.fpath.parent / 'vali.kwcoco.zip', overwrite=True, verbose=3)

    # See ~/code/ndsampler/train.sh

if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/shitspotter/make_splits.py
    """
    # Script entry point: builds and writes the train/vali kwcoco splits.
    make_splits()
1 change: 1 addition & 0 deletions shitspotter/cid_revisions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ bafybeiflkm37altah2ey2jxko7kngquwfugyo4cl36y7xjf7o2lbrgucbi
bafybeiczi4pn4na2iw7c66bpbf5rdr3ua3grp2qvjgrmnuzqabjjim4o2q
bafybeiczi4pn4na2iw7c66bpbf5rdr3ua3grp2qvjgrmnuzqabjjim4o2q
bafybeieahblb6aafomi72gnheu3ihom7nobdad4t6jcrrwhd5eb3wxkrgy
bafybeief7tmoarwmd26b2petx7crtvdnz6ucccek5wpwxwdvfydanfukna
139 changes: 137 additions & 2 deletions shitspotter/gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ def main():
all_fpaths = []
change_point = dateutil.parser.parse('2021-05-11T120000')
walk_prog = ub.ProgIter(desc='walking')

extensions = set()

block_extensions = ('.mp4', '.json')

with walk_prog:
for r, ds, fs in os.walk(dpath, followlinks=True):
walk_prog.step()
Expand All @@ -48,11 +53,18 @@ def main():
for fname in fs:
gpath = join(r, fname)
all_fpaths.append(gpath)
if fname.endswith('.mp4'):
if fname.endswith(block_extensions):
continue
if fname in seen:
print('SEEN fname = {!r}'.format(fname))
continue

ext = fname.split('.')[-1]

if ext == 'shitspotter':
raise Exception

extensions.add(ext)
seen.add(fname)
rows.append({
'gpath': gpath,
Expand Down Expand Up @@ -143,7 +155,10 @@ def main():
raise
# TODO: exif 'OffsetTime': '-05:00',
row['datetime'] = dt.isoformat()
# exif_ori = exif.get('Orientation', None)

exif_ori = exif.get('Orientation', None)
row['exif_ori'] = exif_ori

# print('exif_ori = {!r}'.format(exif_ori))
geos_point = exif.get('GPSInfo', None)
if geos_point is not None and 'GPSLatitude' in geos_point:
Expand Down Expand Up @@ -178,11 +193,131 @@ def main():
coco_dset._ensure_json_serializable()
print('coco_dset.fpath = {!r}'.format(coco_dset.fpath))
coco_dset.reroot(absolute=False)
coco_dset.clear_annotations()

ADD_LABELME_ANNOTS = 1
if ADD_LABELME_ANNOTS:
import json
import kwimage
json_fpaths = sorted((dpath / 'assets').glob('*/*.json'))
for fpath in ub.ProgIter(json_fpaths):

if True:
# Fixup labelme json files
# Remove image data, fix bad labels
labelme_data = json.loads(fpath.read_text())
needs_write = 0
if labelme_data.get('imageData', None) is not None:
labelme_data['imageData'] = None
needs_write = 1

for shape in labelme_data['shapes']:
if shape['label'] == 'poop;':
shape['label'] = 'poop'

if needs_write:
fpath.write_text(json.dumps(labelme_data))

labelme_data = json.loads(fpath.read_text())
imginfo, annsinfo = labelme_to_coco_structure(labelme_data)
image_name = imginfo['file_name'].rsplit('.', 1)[0]
img = coco_dset.index.name_to_img[image_name]

# Construct the inverted exif transform
# (From exif space -> raw space)
rot_ccw = 0
flip_axis = None
if img['exif_ori'] == 8:
rot_ccw = 3
elif img['exif_ori'] == 3:
rot_ccw = 2
elif img['exif_ori'] == 6:
rot_ccw = 1
elif img['exif_ori'] == 7:
flip_axis = 1
rot_ccw = 3
elif img['exif_ori'] == 4:
flip_axis = 1
rot_ccw = 2
elif img['exif_ori'] == 5:
flip_axis = 1
rot_ccw = 1
exif_canvas_dsize = (labelme_data['imageWidth'], labelme_data['imageHeight'])
inv_exif = kwimage.Affine.fliprot(
flip_axis=flip_axis, rot_k=rot_ccw,
canvas_dsize=exif_canvas_dsize
)

for ann in annsinfo:
ann = ann.copy()
poly = kwimage.Polygon.from_coco(ann['segmentation'])

if not inv_exif.isclose_identity():
# if img['id'] not in {0}:
# raise Exception(img['id'])
# LabelMe Polygons are annotated in EXIF space, but
# we need them in raw space for kwcoco.
poly = poly.warp(inv_exif)

ann['segmentation'] = poly.to_coco(style='new')
ann['bbox'] = poly.box().quantize().to_coco()

catname = ann.pop('category_name')
cid = coco_dset.ensure_category(catname)
ann['category_id'] = cid
ann['image_id'] = img['id']
coco_dset.add_annotation(**ann)

if 0:
import kwplot
kwplot.autompl(recheck=1, force='QtAgg')
if not inv_exif.isclose_identity():
coco_dset.show_image(img['id'])
if img['id'] not in {0, 1575, 7, 1554}:
raise Exception(img['id'])
#
coco_dset.dump(coco_dset.fpath, newlines=True)


def labelme_to_coco_structure(labelme_data):
    """
    Translate one parsed LabelMe JSON dict into COCO-style structures.

    Args:
        labelme_data (dict): contents of a LabelMe annotation file.

    Returns:
        Tuple[dict, List[dict]]: an image-info dict and a list of
        annotation dicts. Each annotation carries a 'category_name'
        (not yet resolved to a category id) plus 'bbox'/'segmentation'.

    Raises:
        NotImplementedError: for LabelMe features this converter does not
            handle (group ids, descriptions, non-polygon shapes, flags).
    """
    import kwimage
    import numpy as np
    image_info = {
        'file_name': labelme_data['imagePath'],
        'width': labelme_data['imageWidth'],
        'height': labelme_data['imageHeight'],
    }
    annotations = []
    for shape in labelme_data['shapes']:
        points = shape['points']

        # Guard clauses: bail loudly on any LabelMe feature we cannot map.
        if shape['group_id'] is not None:
            raise NotImplementedError('groupid')
        if shape['description']:
            raise NotImplementedError('desc')
        shape_type = shape['shape_type']
        if shape_type != 'polygon':
            raise NotImplementedError(shape_type)
        if shape['flags']:
            raise NotImplementedError('flags')

        polygon = kwimage.Polygon.coerce(np.array(points))
        annotations.append({
            'category_name': shape['label'],
            'bbox': polygon.box().quantize().to_coco(),
            'segmentation': polygon.to_coco(style='new'),
        })

    return image_info, annotations


if __name__ == '__main__':
"""
CommandLine:
Expand Down
30 changes: 22 additions & 8 deletions shitspotter/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,17 @@ def autofind_pair_hueristic(coco_dset=None):
# from vtool_ibeis.matching import VSONE_FEAT_CONFIG

image_df = pd.DataFrame(coco_dset.dataset['images'])
ordered_gids = image_df.sort_values('datetime').id.tolist()

has_annots = [len(aids) > 0 for aids in coco_dset.images(image_df['id']).aids]
image_df['has_annots'] = has_annots
image_df = image_df.sort_values('datetime')

ordered_gids = image_df.id.tolist()
feat_cfg = {
'rotation_invariance': True,
'affine_invariance': True,
}
image_df = image_df.set_index('id', drop=False)

# Fails on 31, 32

Expand Down Expand Up @@ -94,8 +100,8 @@ def matchable_image(gid):
pair = (coco_img1['name'], coco_img2['name'])
key = ub.urepr(pair, compact=1)
if key not in existing_keys:
dt1 = dateutil.parser.parse(coco_img1['datetime'])
dt2 = dateutil.parser.parse(coco_img2['datetime'])
dt1 = coco_img1.datetime
dt2 = coco_img2.datetime
delta = dt1 - dt2
delta_seconds = delta.total_seconds()
if delta_seconds < compare_time_thresh:
Expand Down Expand Up @@ -137,7 +143,7 @@ def matchable_image(gid):
key = ub.urepr((match['name1'], match['name2']), compact=1)
image_matches[key] = match

# Save the match table
# Save the match table shelf
image_matches.sync()

# coco_dset.dump(coco_dset.fpath, newlines=True)
Expand Down Expand Up @@ -197,6 +203,8 @@ def matchable_image(gid):
good_pairwise_idxs.append(idx + 1)
idx += 2
else:
# import xdev
# xdev.embed()
bad_pairwise_items += 1
idx += 1

Expand Down Expand Up @@ -264,6 +272,11 @@ def matchable_image(gid):
total_imgs = len(coco_dset.imgs)
print(f'total_images = {total_imgs}')

num_images_with_annots = sum([bool(a) for a in coco_dset.images().annots])
num_annots = coco_dset.n_annots
print('num_images_with_annots = {}'.format(ub.urepr(num_images_with_annots, nl=1)))
print('num_annots = {}'.format(ub.urepr(num_annots, nl=1)))

if 1:
import datetime as datetime_mod
today = datetime_mod.datetime.now().date()
Expand All @@ -272,12 +285,13 @@ def matchable_image(gid):
'# Images': total_imgs,
'# Estimated Groups': total_estimated_number_of_tups,
'# Registered Groups': total_matchable_tups,
'# Annotated Images': num_images_with_annots,
}
print('New row for README')
print('| {:<12s}| {:<8s} | {:<18s} | {:<22s}|'.format(*list(row.keys())))
print('+=============+==========+=====================+=======================+')
print('| {:<12s}| {:<7d} | ~{:<17d} | {:<22d}|'.format(*list(row.values())))
print('+-------------+----------+---------------------+-----------------------+')
print('| {:<12s}| {:<8s} | {:<18s} | {:<22s}| {:<22s}|'.format(*list(row.keys())))
print('+=============+==========+=====================+=======================+=======================+')
print('| {:<12s}| {:<7d} | ~{:<17d} | {:<22d}| {:<22d}|'.format(*list(row.values())))
print('+-------------+----------+---------------------+-----------------------+-----------------------+')
# import tabulate
# import pandas as pd
# df = pd.DataFrame([row])
Expand Down
Loading

0 comments on commit 15c01ad

Please sign in to comment.