Initial commit of blog post for video classificatoin methods.

41d3f8a5 · Matt Harvey · 41d3f8a5 · 41d3f8a5 · 41d3f8a5 · 41d3f8a5
Commit 41d3f8a5 authored 8 years ago by Matt Harvey
Hide whitespace changes
Inline Side-by-side

Showing

with 1127 additions and 0 deletions
+1127 -0
--- a/.gitignore
+++ b/.gitignore
+*jpg
+*rar
+*zip
+*gz
+*avi
+*swp
+data/logs/*
+data/checkpoints/*
+data/sequences/*
+data/train/*
+data/test/*
--- a/README.md
+++ b/README.md
+# Five video classification methods
+
+The five video classification methods:
+
+1. Classify one frame at a time with a ConvNet
+1. Extract features from each frame with a ConvNet, passing the sequence to an RNN, in a separate network
+1. Use a time-dstirbuted ConvNet, passing the features to an RNN, much like #2 but all in one network
+1. Extract features from each frame with a ConvNet and pass the sequence to an MLP
+1. Use a 3D convolutional network
+
+See the accompanying blog post for full details: TK
+
+## Getting the data
+
+First, download the dataset from UCF into the `data` folder:
+
+`cd data && wget http://crcv.ucf.edu/data/UCF101/UCF101.rar`
+
+Then extract it with `unrar -e UCF101.rar`.
+
+Next, create train and test folders with `mkdir train && mkdir test`.
+
+Now you can run the scripts in the data folder to move the videos to the appropriate place, extract their frames and make the CSV file the rest of the code references.
+
+## Extracting features
+
+Before you can run Methods #2 and #4, you need to extract features from the images with the CNN. This is done by running `extract_features.py`. On my Dell with a GeFore 960m GPU, this takes about 8 hours. If you want to limit to just the first N classes, you can set that option in the file.
+
+## Running models
+
+The CNN-only method (method #1 in the blog post) is run from `train_cnn.py`.
+
+The rest of the models are run from `train.py`. There are configuration options you can set in that file to choose which model you want to run.
+
+The models are all defined in `models.py`. Reference that file to see which models you are able to run in `train.py`.
+
+### UCF101 Citation
+
+Khurram Soomro, Amir Roshan Zamir and Mubarak Shah, UCF101: A Dataset of 101 Human Action Classes From Videos in The Wild., CRCV-TR-12-01, November, 2012. 
+
--- a/__pycache__/processor.cpython-35.pyc
+++ b/__pycache__/processor.cpython-35.pyc
--- a/data.py
+++ b/data.py
+"""
+Class for managing our data.
+"""
+import csv
+import numpy as np
+import random
+import glob
+import os.path
+import pandas as pd
+import sys
+import operator
+from processor import process_image
+from keras.utils import np_utils
+
+class DataSet():
+    """Index of the extracted video samples, plus batch generators over them.
+
+    Rows come from ./data/data_file.csv. The methods below read each row as
+    [train|test, class name, filename without extension, number of frames].
+    """
+
+    def __init__(self, seq_length=40, class_limit=None, image_shape=(224, 224, 3)):
+        """Constructor.
+        seq_length = (int) the number of frames to consider
+        class_limit = (int) number of classes to limit the data to.
+            None = no limit.
+        image_shape = (tuple) shape handed to process_image when building
+            image sequences.
+        """
+        self.seq_length = seq_length
+        self.class_limit = class_limit
+        self.sequence_path = './data/sequences/'
+        self.max_frames = 300  # max number of frames a video can have for us to use it
+
+        # Order matters: get_classes() reads the raw rows, and clean_data()
+        # needs self.classes in order to filter them.
+        self.data = self.get_data()
+        self.classes = self.get_classes()
+        self.data = self.clean_data()
+
+        self.image_shape = image_shape
+
+    @staticmethod
+    def get_data():
+        """Load our data from file and return it as a list of rows."""
+        with open('./data/data_file.csv', 'r') as fin:
+            return list(csv.reader(fin))
+
+    def clean_data(self):
+        """Limit samples to at least the sequence length and at most
+        max_frames frames. Also limit it to classes we want to use."""
+        return [
+            item for item in self.data
+            if self.seq_length <= int(item[3]) <= self.max_frames
+            and item[1] in self.classes
+        ]
+
+    def get_classes(self):
+        """Extract the sorted class names from our data. If we want to
+        limit them, only return the first class_limit classes."""
+        # A set de-duplicates in one pass instead of scanning a list per row.
+        classes = sorted({item[1] for item in self.data})
+
+        if self.class_limit is not None:
+            return classes[:self.class_limit]
+        return classes
+
+    def get_class_one_hot(self, class_str):
+        """Given a class as a string, return its one-hot encoding as a
+        1-D array of length len(self.classes).
+
+        Raises ValueError if class_str is not in self.classes.
+        """
+        # Encode it first.
+        label_encoded = self.classes.index(class_str)
+
+        # Now one-hot it.
+        label_hot = np_utils.to_categorical(label_encoded, len(self.classes))
+
+        # Flatten so callers get a single row whether to_categorical hands
+        # back a (1, n) matrix or an (n,) vector.
+        return np.asarray(label_hot).reshape(-1)
+
+    def split_train_test(self):
+        """Split the data into train and test groups. Any row whose first
+        column is not 'train' goes into the test group."""
+        train = [item for item in self.data if item[0] == 'train']
+        test = [item for item in self.data if item[0] != 'train']
+        return train, test
+
+    def frame_generator(self, batch_size, train_test, data_type, concat=False):
+        """Return a generator that we can use to train on. It yields
+        (X, y) numpy arrays of batch_size randomly chosen samples, forever.
+
+        train_test: 'train' selects the train rows, anything else the test rows
+        data_type: 'images' builds the sequence from frames on disk; any
+            other value (e.g. 'features') loads the saved extracted features
+        concat: if True, flatten each sequence into a single 1-D array
+
+        Raises ValueError when a saved feature sequence cannot be found.
+        """
+        # Get the right dataset for the generator.
+        train, test = self.split_train_test()
+        data = train if train_test == 'train' else test
+
+        print("Creating %s generator with %d samples." % (train_test, len(data)))
+
+        while 1:
+            X, y = [], []
+
+            # Generate batch_size samples.
+            for _ in range(batch_size):
+                # Get a random sample.
+                sample = random.choice(data)
+
+                # Compare by value: `is` on a string literal tests identity
+                # and only works by accident of interning.
+                if data_type == "images":
+                    # Get and resample frames.
+                    frames = self.get_frames_for_sample(sample)
+                    frames = self.rescale_list(frames, self.seq_length)
+
+                    # Build the image sequence
+                    sequence = self.build_image_sequence(frames)
+                else:
+                    # Get the sequence from disk.
+                    sequence = self.get_extracted_sequence(sample)
+
+                if sequence is None:
+                    # Raise rather than exit so the failure carries a traceback.
+                    raise ValueError("Can't find sequence. Did you generate them?")
+
+                if concat:
+                    # We want to pass the sequence back as a single array. This
+                    # is used to pass into an MLP rather than an RNN.
+                    sequence = np.concatenate(sequence).ravel()
+
+                X.append(sequence)
+                y.append(self.get_class_one_hot(sample[1]))
+
+            yield np.array(X), np.array(y)
+
+    def build_image_sequence(self, frames):
+        """Given a set of frames (filenames), build our sequence."""
+        return [process_image(x, self.image_shape) for x in frames]
+
+    def get_extracted_sequence(self, sample):
+        """Get the saved extracted features for a sample row, or None if
+        the features file does not exist."""
+        filename = sample[2]
+        path = self.sequence_path + filename + '-' + str(self.seq_length) + \
+            '-features.txt'
+        if not os.path.isfile(path):
+            return None
+
+        # Use a dataframe/read_csv for speed increase over numpy.
+        features = pd.read_csv(path, sep=" ", header=None)
+        return features.values
+
+    @staticmethod
+    def get_frames_for_sample(sample):
+        """Given a sample row from the data file, get all the corresponding frame
+        filenames, sorted by name."""
+        path = './data/' + sample[0] + '/' + sample[1] + '/'
+        filename = sample[2]
+        return sorted(glob.glob(path + filename + '*jpg'))
+
+    @staticmethod
+    def get_filename_from_image(filename):
+        """Return the last '/'-separated part of a path with '.jpg' removed."""
+        parts = filename.split('/')
+        return parts[-1].replace('.jpg', '')
+
+    @staticmethod
+    def rescale_list(input_list, size):
+        """Given a list and a size, return a rescaled/sampled list. For example,
+        if we want a list of size 5 and we have a list of size 25, return a new
+        list of size five which is every 5th element of the original list."""
+        assert len(input_list) >= size
+
+        # Get the number to skip between iterations.
+        skip = len(input_list) // size
+
+        # Build our new output.
+        output = [input_list[i] for i in range(0, len(input_list), skip)]
+
+        # Cut off the last one if needed.
+        return output[:size]
+
+    def print_class_from_prediction(self, predictions, nb_to_return=5):
+        """Given a prediction (one score per class, in self.classes order),
+        print the top nb_to_return classes, stopping early at a 0.0 score."""
+        # Get the prediction for each label. This reads self.classes instead
+        # of a module-level `data` variable that only exists in __main__.
+        label_predictions = {}
+        for i, label in enumerate(self.classes):
+            label_predictions[label] = predictions[i]
+
+        # Now sort them.
+        sorted_lps = sorted(
+            label_predictions.items(),
+            key=operator.itemgetter(1),
+            reverse=True
+        )
+
+        # And print the top N.
+        for i, class_prediction in enumerate(sorted_lps):
+            if i > nb_to_return - 1 or class_prediction[1] == 0.0:
+                break
+            print("%s: %.2f" % (class_prediction[0], class_prediction[1]))
+
+if __name__ == '__main__':
+    # Manual smoke test: pull one batch of image sequences, dump the frames
+    # of the first sample to disk and print its label.
+    from PIL import Image
+    data = DataSet(seq_length=20, class_limit=3)
+    print(data.classes)
+    gen = data.frame_generator(32, 'train', 'images', False)
+    for x, y in gen:
+        # x should be 32, with each having seq_length images.
+        for j, row in enumerate(x):
+            for i, image in enumerate(row):
+                # Scale back up to 0-255 before converting to uint8 for PIL.
+                image *= 255
+                actual = Image.fromarray(image.astype('uint8'))
+                # NOTE(review): presumably the tmp/ directory must already
+                # exist — confirm.
+                actual.save('tmp/whatever-' + str(i) + '.jpg')
+            print(y[j])
+            data.print_class_from_prediction(y[j])
+            # Only the first sample of the first batch is inspected.
+            break
+        break
--- a/data/1_move_files.py
+++ b/data/1_move_files.py
+"""
+After extracting the RAR, we run this to move all the files into
+the appropriate train/test folders.
+
+Should only run this file once!
+"""
+import os
+import os.path
+
+def get_train_test_lists(version='01'):
+    """
+    Read the train/test split files for the given version (01, 02, or 03)
+    and return {'train': [...], 'test': [...]} with one video path per entry.
+    """
+    # Test rows are used as-is once stripped.
+    with open('./ucfTrainTestlist/testlist' + version + '.txt') as fin:
+        test_list = [line.strip() for line in fin]
+
+    # Train rows carry a trailing class index after a space; keep only the
+    # part before the first space.
+    with open('./ucfTrainTestlist/trainlist' + version + '.txt') as fin:
+        train_list = [line.strip().split(' ')[0] for line in fin]
+
+    return {
+        'train': train_list,
+        'test': test_list
+    }
+
+def move_files(file_groups):
+    """Move every listed video from the current directory into
+    <group>/<classname>/, creating class folders as needed.
+
+    file_groups maps a group name ('train'/'test') to a list of
+    'classname/filename' entries. Files that are not found in the current
+    directory (e.g. already moved) are reported and skipped.
+    """
+    for group, videos in file_groups.items():
+        for video in videos:
+            # Entries look like "classname/filename".
+            parts = video.split('/')
+            classname, filename = parts[0], parts[1]
+            class_dir = group + '/' + classname
+
+            # Make sure the destination class folder exists.
+            if not os.path.exists(class_dir):
+                print("Creating folder for %s/%s" % (group, classname))
+                os.makedirs(class_dir)
+
+            if os.path.exists(filename):
+                dest = class_dir + '/' + filename
+                print("Moving %s to %s" % (filename, dest))
+                os.rename(filename, dest)
+            else:
+                # Either already moved or never extracted.
+                print("Can't find %s to move. Skipping." % (filename))
+
+    print("Done.")
+
+def main():
+    """
+    Build the train/test video lists from the split files, then move each
+    video into its group/class folder.
+    """
+    move_files(get_train_test_lists())
+
+if __name__ == '__main__':
+    main()
--- a/data/2_extract_files.py
+++ b/data/2_extract_files.py
+"""
+After moving all the files using the 1_ file, we run this one to extract
+the images from the videos and also create a data file we can use
+for training and testing later.
+"""
+import csv
+import glob
+import os
+import os.path
+from subprocess import call
+
+def extract_files():
+    """After we have all of our videos split between train and test, and
+    all nested within folders representing their classes, we need to
+    make a data file that we can reference when training our RNN(s).
+    This will let us keep track of image sequences and other parts
+    of the training process.
+
+    We'll first need to extract images from each of the videos. We'll
+    need to record the following data in the file:
+
+    [train|test], class, filename, nb frames
+
+    Extracting can be done with ffmpeg:
+    `ffmpeg -i video.mpg image-%04d.jpg`
+
+    Videos whose first frame already exists are not re-extracted; their
+    frames are only counted. The rows are written to data_file.csv in the
+    current directory.
+    """
+    data_file = []
+    folders = ['./train/', './test/']
+
+    for folder in folders:
+        class_folders = glob.glob(folder + '*')
+
+        for vid_class in class_folders:
+            class_files = glob.glob(vid_class + '/*.avi')
+
+            for video_path in class_files:
+                # Get the parts of the file.
+                video_parts = get_video_parts(video_path)
+
+                train_or_test, classname, filename_no_ext, filename = video_parts
+
+                # Only extract if we haven't done it yet. Otherwise, just get
+                # the info.
+                if not check_already_extracted(video_parts):
+                    # Now extract it.
+                    class_dir = train_or_test + '/' + classname + '/'
+                    src = class_dir + filename
+                    dest = class_dir + filename_no_ext + '-%04d.jpg'
+                    call(["ffmpeg", "-i", src, dest])
+
+                # Now get how many frames it is.
+                nb_frames = get_nb_frames_for_video(video_parts)
+
+                data_file.append([train_or_test, classname, filename_no_ext, nb_frames])
+
+                print("Generated %d frames for %s" % (nb_frames, filename_no_ext))
+
+    # newline='' lets the csv module control line endings itself; without it
+    # an extra blank line appears after every row on some platforms.
+    with open('data_file.csv', 'w', newline='') as fout:
+        writer = csv.writer(fout)
+        writer.writerows(data_file)
+
+    print("Extracted and wrote %d video files." % (len(data_file)))
+
+def get_nb_frames_for_video(video_parts):
+    """Count the .jpg files on disk whose name starts with this video's
+    extension-less filename, i.e. the frames extracted for it."""
+    train_or_test, classname, filename_no_ext, _ = video_parts
+    pattern = train_or_test + '/' + classname + '/' + filename_no_ext + '*.jpg'
+    return len(glob.glob(pattern))
+
+def get_video_parts(video_path):
+    """Split a './<train|test>/<class>/<file>' path into
+    (train_or_test, classname, filename_no_ext, filename)."""
+    pieces = video_path.split('/')
+    train_or_test, classname, filename = pieces[1], pieces[2], pieces[3]
+
+    # Everything before the first dot is treated as the base name.
+    filename_no_ext = filename.split('.')[0]
+
+    return train_or_test, classname, filename_no_ext, filename
+
+def check_already_extracted(video_parts):
+    """Return True if the -0001 frame of this video already exists on disk."""
+    train_or_test, classname, filename_no_ext, _ = video_parts
+    first_frame = train_or_test + '/' + classname + '/' + \
+        filename_no_ext + '-0001.jpg'
+    return bool(os.path.exists(first_frame))
+
+def main():
+    """
+    Script entry point: extract frames from every video and write the data
+    input file, one row per video in the format:
+
+    [train|test], class, filename, nb frames
+    """
+    extract_files()
+
+if __name__ == '__main__':
+    main()
--- a/extract_features.py
+++ b/extract_features.py
+"""
+This script generates extracted features for each video, which other
+models make use of.
+
+You can change you sequence length and limit to a set number of classes
+below.
+
+class_limit is an integer that denotes the first N classes you want to
+extract features from. This is useful is you don't want to wait to
+extract all 101 classes. For instance, set class_limit = 8 to just
+extract features for the first 8 (alphabetical) classes in the dataset.
+Then set the same number when training models.
+"""
+import numpy as np
+import os.path
+from data import DataSet
+from extractor import Extractor
+from tqdm import tqdm
+
+# Set defaults.
+seq_length = 40
+class_limit = None  # integer, number of classes to extract
+
+# Get the dataset.
+data = DataSet(seq_length=seq_length, class_limit=class_limit)
+
+# get the model.
+model = Extractor()
+
+# Save into the same folder DataSet.get_extracted_sequence() reads from, so
+# the training generators can find what we write here.
+sequence_dir = data.sequence_path
+os.makedirs(sequence_dir, exist_ok=True)
+
+# Loop through data.
+pbar = tqdm(total=len(data.data))
+for video in data.data:
+
+    # Get the path to the sequence for this video. The naming must match
+    # DataSet.get_extracted_sequence(): <filename>-<seq_length>-features.txt
+    path = sequence_dir + video[2] + '-' + str(seq_length) + \
+        '-features.txt'
+
+    # Check if we already have it.
+    if os.path.isfile(path):
+        pbar.update(1)
+        continue
+
+    # Get the frames for this video.
+    frames = data.get_frames_for_sample(video)
+
+    # Now downsample to just the ones we need.
+    frames = data.rescale_list(frames, seq_length)
+
+    # Now loop through and extract features to build the sequence.
+    sequence = [model.extract(image) for image in frames]
+
+    # Save the sequence.
+    np.savetxt(path, sequence)
+
+    pbar.update(1)
+
+pbar.close()
--- a/extractor.py
+++ b/extractor.py
+from keras.preprocessing import image
+from keras.applications.inception_v3 import InceptionV3, preprocess_input
+from keras.models import Model, load_model
+from keras.layers import Input
+import numpy as np
+
+class Extractor():
+    """Wraps a Keras model and returns a feature vector for a single image
+    file via extract()."""
+
+    def __init__(self, weights=None):
+        """Either load pretrained from imagenet, or load our saved
+        weights from our own training.
+
+        weights = None to use InceptionV3 with imagenet weights, otherwise
+            a path passed to keras load_model.
+        """
+
+        self.weights = weights  # so we can check elsewhere which model
+
+        if weights is None:
+            # Get model with pretrained weights.
+            input_tensor = Input(shape=(299, 299, 3))
+            base_model = InceptionV3(
+                input_tensor=input_tensor,
+                weights='imagenet',
+                include_top=True
+            )
+
+            # We'll extract features at the final pool layer.
+            self.model = Model(
+                input=base_model.input,
+                output=base_model.get_layer('avg_pool').output
+            )
+
+        else:
+            # Load the model first.
+            self.model = load_model(weights)
+
+            # Then remove the top so we get features not predictions.
+            # From: https://github.com/fchollet/keras/issues/2371
+            # NOTE(review): this edits the model's internals in place and
+            # presumably depends on the installed Keras version — confirm.
+            self.model.layers.pop()
+            self.model.layers.pop()  # two pops to get to pool layer
+            self.model.outputs = [self.model.layers[-1].output]
+            self.model.output_layers = [self.model.layers[-1]]
+            self.model.layers[-1].outbound_nodes = []
+
+    def extract(self, image_path):
+        """Load the image at image_path, resize it to 299x299, run it
+        through the model and return the resulting features."""
+        img = image.load_img(image_path, target_size=(299, 299))
+        x = image.img_to_array(img)
+        # Add a leading axis so the model sees a batch of one image.
+        x = np.expand_dims(x, axis=0)
+        x = preprocess_input(x)
+
+        # Get the prediction.
+        features = self.model.predict(x)
+
+        if self.weights is None:
+            # For imagenet/default network:
+            # assumes the avg_pool output has three leading singleton axes
+            # before the feature axis — TODO confirm for the Keras in use.
+            features = features[0][0][0]
+        else:
+            # For loaded network:
+            features = features[0]
+
+        return features
--- a/models.py
+++ b/models.py
+"""
+A collection of models we'll use to attempt to classify videos.
+"""
+from keras.layers import Dense, Flatten, Dropout
+from keras.layers.recurrent import LSTM, GRU
+from keras.models import Sequential, load_model
+from keras.optimizers import Adam
+from keras.layers.wrappers import TimeDistributed
+from keras.layers.convolutional import Convolution2D, MaxPooling3D, Convolution3D
+from collections import deque
+
+class ResearchModels():
+    """Builds (or loads) one of the video classification networks and
+    compiles it; the compiled network is available as `self.model`."""
+
+    def __init__(self, nb_classes, model, seq_length,
+                 saved_model=None, features_length=2048):
+        """
+        `model` = one of:
+            lstm
+            crnn
+            mlp
+            conv_3d
+        `nb_classes` = the number of classes to predict
+        `seq_length` = the length of our video sequences
+        `saved_model` = the path to a saved Keras model to load; when given,
+            it takes precedence over `model`
+        `features_length` = the size of one extracted feature vector, used
+            for the lstm and mlp input shapes
+
+        Raises ValueError if no saved model is given and `model` is not one
+        of the names above.
+        """
+
+        # Set defaults.
+        self.seq_length = seq_length
+        self.load_model = load_model
+        self.saved_model = saved_model
+        self.nb_classes = nb_classes
+        self.feature_queue = deque()
+
+        # Set the metrics. Only use top k if there's a need.
+        metrics = ['accuracy']
+        if self.nb_classes >= 10:
+            metrics.append('top_k_categorical_accuracy')
+
+        # Get the appropriate model.
+        if self.saved_model is not None:
+            print("Loading model %s" % self.saved_model)
+            self.model = load_model(self.saved_model)
+        elif model == 'lstm':
+            print("Loading Deep LSTM model.")
+            self.input_shape = (seq_length, features_length)
+            self.model = self.lstm()
+        elif model == 'crnn':
+            print("Loading CRNN model.")
+            self.input_shape = (seq_length, 224, 224, 3)
+            self.model = self.crnn()
+        elif model == 'mlp':
+            print("Loading simple MLP.")
+            self.input_shape = features_length * seq_length
+            self.model = self.mlp()
+        elif model == 'conv_3d':
+            print("Loading Conv3D")
+            self.input_shape = (seq_length, 112, 112, 3)
+            self.model = self.conv_3d()
+        else:
+            # `sys` is not imported in this module, so the old sys.exit()
+            # call could only fail with a NameError. Raise a clear error.
+            raise ValueError("Unknown network: %s" % model)
+
+        # Now compile the network.
+        optimizer = Adam(lr=1e-6)  # aggressively small learning rate
+        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer,
+                           metrics=metrics)
+
+    def lstm(self):
+        """Build a simple LSTM network. We pass the extracted features from
+        our CNN to this model predominantly."""
+        # Model.
+        model = Sequential()
+        model.add(LSTM(128, return_sequences=True, input_shape=self.input_shape,
+                       dropout_W=0.4, dropout_U=0.4))
+        model.add(LSTM(128, return_sequences=True, dropout_W=0.4, dropout_U=0.4))
+        model.add(LSTM(128, return_sequences=True, dropout_W=0.4, dropout_U=0.4))
+        model.add(Flatten())
+        model.add(Dense(256, activation='relu'))
+        model.add(Dropout(0.2))
+        model.add(Dense(256, activation='relu'))
+        model.add(Dropout(0.2))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+
+        return model
+
+    def crnn(self):
+        """Build a CNN into RNN.
+        Starting version from:
+        https://github.com/udacity/self-driving-car/blob/master/
+            steering-models/community-models/chauffeur/models.py
+        """
+        model = Sequential()
+        model.add(TimeDistributed(Convolution2D(24, 5, 5,
+            init="he_normal",
+            activation='relu',
+            subsample=(5, 4),
+            border_mode='valid'), input_shape=self.input_shape))
+        model.add(TimeDistributed(Convolution2D(32, 5, 5,
+            init="he_normal",
+            activation='relu',
+            subsample=(3, 2),
+            border_mode='valid')))
+        model.add(TimeDistributed(Convolution2D(48, 3, 3,
+            init="he_normal",
+            activation='relu',
+            subsample=(1, 2),
+            border_mode='valid')))
+        model.add(TimeDistributed(Convolution2D(64, 3, 3,
+            init="he_normal",
+            activation='relu',
+            border_mode='valid')))
+        model.add(TimeDistributed(Convolution2D(128, 3, 3,
+            init="he_normal",
+            activation='relu',
+            subsample=(1, 2),
+            border_mode='valid')))
+        model.add(TimeDistributed(Flatten()))
+        model.add(Dropout(0.2))
+        model.add(GRU(128, return_sequences=True))
+        model.add(Dropout(0.2))
+        model.add(GRU(128, return_sequences=True))
+        model.add(Dropout(0.2))
+        model.add(GRU(128, return_sequences=True))
+        model.add(Flatten())
+        model.add(Dense(256))
+        model.add(Dropout(0.2))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+
+        return model
+
+    def mlp(self):
+        """Build a simple MLP. The input is a single flat vector of
+        length self.input_shape."""
+        # Model.
+        model = Sequential()
+        model.add(Dense(512, input_dim=self.input_shape))
+        model.add(Dropout(0.4))
+        model.add(Dense(512))
+        model.add(Dropout(0.4))
+        model.add(Dense(512))
+        model.add(Dropout(0.4))
+        model.add(Dense(512))
+        model.add(Dropout(0.4))
+        model.add(Dense(512))
+        model.add(Dropout(0.4))
+        model.add(Dense(self.nb_classes, activation='softmax'))
+
+        return model
+
+    def conv_3d(self):
+        """
+        Build a 3D convolutional network, C3D.
+
+        Based on the paper:
+            https://arxiv.org/pdf/1412.0767.pdf
+
+        As implemented in Keras by:
+            https://gist.github.com/albertomontesg/d8b21a179c1e6cca0480ebdf292c34d2
+
+        Note that this requires a lot of memory to run.
+        """
+        # Imported here because the module-level imports do not include it.
+        from keras.layers.convolutional import ZeroPadding3D
+
+        model = Sequential()
+        # 1st layer group
+        model.add(Convolution3D(64, 3, 3, 3, activation='relu',
+                                border_mode='same', name='conv1',
+                                subsample=(1, 1, 1),
+                                input_shape=self.input_shape))
+        model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2),
+                               border_mode='valid', name='pool1'))
+        # 2nd layer group
+        model.add(Convolution3D(128, 3, 3, 3, activation='relu',
+                                border_mode='same', name='conv2',
+                                subsample=(1, 1, 1)))
+        model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),
+                               border_mode='valid', name='pool2'))
+        # 3rd layer group
+        model.add(Convolution3D(256, 3, 3, 3, activation='relu',
+                                border_mode='same', name='conv3a',
+                                subsample=(1, 1, 1)))
+        model.add(Convolution3D(256, 3, 3, 3, activation='relu',
+                                border_mode='same', name='conv3b',
+                                subsample=(1, 1, 1)))
+        model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),
+                               border_mode='valid', name='pool3'))
+        # 4th layer group
+        model.add(Convolution3D(512, 3, 3, 3, activation='relu',
+                                border_mode='same', name='conv4a',
+                                subsample=(1, 1, 1)))
+        model.add(Convolution3D(512, 3, 3, 3, activation='relu',
+                                border_mode='same', name='conv4b',
+                                subsample=(1, 1, 1)))
+        model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),
+                               border_mode='valid', name='pool4'))
+        # 5th layer group
+        model.add(Convolution3D(512, 3, 3, 3, activation='relu',
+                                border_mode='same', name='conv5a',
+                                subsample=(1, 1, 1)))
+        model.add(Convolution3D(512, 3, 3, 3, activation='relu',
+                                border_mode='same', name='conv5b',
+                                subsample=(1, 1, 1)))
+        model.add(ZeroPadding3D(padding=(0, 1, 1)))
+        model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),
+                               border_mode='valid', name='pool5'))
+        model.add(Flatten())
+        # FC layers group
+        model.add(Dense(4096, activation='relu', name='fc6'))
+        model.add(Dropout(.5))
+        model.add(Dense(4096, activation='relu', name='fc7'))
+        model.add(Dropout(.5))
+        # The output width must match our label encoding, not a fixed
+        # class count carried over from the reference implementation.
+        model.add(Dense(self.nb_classes, activation='softmax', name='fc8'))
+        return model
+
--- a/plot_trainlog.py
+++ b/plot_trainlog.py
+"""
+Given a training log file, plot something.
+"""
+import csv
+import matplotlib.pyplot as plt
+
+def main(training_log):
+    """Plot validation accuracy, validation top-k accuracy and a flat 0.65
+    benchmark line from a training log CSV.
+
+    Each data row must have exactly seven columns, in the order: epoch, acc,
+    loss, top_k_categorical_accuracy, val_acc, val_loss,
+    val_top_k_categorical_accuracy. The first row is skipped as a header.
+    """
+    accuracies = []
+    top_5_accuracies = []
+    cnn_benchmark = []  # this is ridiculous
+
+    with open(training_log) as fin:
+        rows = csv.reader(fin)
+        next(rows, None)  # skip the header
+        for row in rows:
+            # Only the two validation columns are plotted.
+            _, _, _, _, val_acc, _, val_top_k = row
+            accuracies.append(float(val_acc))
+            top_5_accuracies.append(float(val_top_k))
+            cnn_benchmark.append(0.65)  # ridiculous
+
+    for series in (accuracies, top_5_accuracies, cnn_benchmark):
+        plt.plot(series)
+    plt.show()
+
+if __name__ == '__main__':
+    training_log = 'data/logs/mlp-training-1489455559.7089438.log'
+    main(training_log)
--- a/processor.py
+++ b/processor.py
+"""
+Process an image that we can pass to our networks.
+"""
+from keras.preprocessing.image import img_to_array, load_img
+import numpy as np
+
+def process_image(image, target_shape):
+    """Load the image file `image`, resize it to the height and width in
+    `target_shape` (a 3-tuple whose last entry is ignored) and return it as
+    a float32 array divided by 255."""
+    height, width, _ = target_shape
+    loaded = load_img(image, target_size=(height, width))
+
+    # Convert to numpy and normalize.
+    pixels = img_to_array(loaded)
+    return (pixels / 255.).astype(np.float32)
--- a/random_and_mode.py
+++ b/random_and_mode.py
+"""
+Try to "classify" samples based on random chance and always guessing
+the most popular category.
+"""
+import random
+from data import DataSet
+
+most_pop = 'TennisSwing'
+
+data = DataSet()
+nb_classes = len(data.classes)
+
+# Count how often a uniformly random class guess, and the fixed guess
+# `most_pop`, match the actual class of each sample.
+nb_random_matched = 0
+nb_mode_matched = 0
+for sample in data.data:
+    guess = random.choice(data.classes)
+    actual = sample[1]
+    nb_random_matched += int(guess == actual)
+    nb_mode_matched += int(actual == most_pop)
+
+nb_samples = len(data.data)
+random_accuracy = nb_random_matched / nb_samples
+mode_accuracy = nb_mode_matched / nb_samples
+print("Randomly matched %.2f%%" % (random_accuracy * 100))
+print("Mode matched %.2f%%" % (mode_accuracy * 100))
--- a/train.py
+++ b/train.py
+"""
+Train our RNN on bottlecap or prediction files generated from our CNN.
+"""
+from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, CSVLogger
+from models import ResearchModels
+from data import DataSet
+import time
+
+def train(data_type, seq_length, model, saved_model=None,
+          concat=False, class_limit=None, image_shape=None):
+    """Build the callbacks, data generators and network, then fit.
+
+    data_type = passed to DataSet.frame_generator ('features' or 'images')
+    seq_length = number of frames per sequence
+    model = model name passed to ResearchModels; also used in output file names
+    saved_model = optional path to a saved model, passed to ResearchModels
+    concat = passed to the generators; flattens each sequence when True
+    class_limit = optional number of classes to limit the data to
+    image_shape = optional image shape; None keeps the DataSet default
+    """
+    # Set variables.
+    nb_epoch = 1000
+    batch_size = 32
+
+    # Callbacks: save the best model, log to TensorBoard, stop when we stop
+    # learning, and save per-epoch results to a timestamped log.
+    checkpoint_path = './data/checkpoints/' + model + '-' + data_type + \
+        '.{epoch:03d}-{val_loss:.3f}.hdf5'
+    checkpointer = ModelCheckpoint(
+        filepath=checkpoint_path,
+        verbose=1,
+        save_best_only=True)
+    tb = TensorBoard(log_dir='./data/logs')
+    early_stopper = EarlyStopping(patience=10)
+    timestamp = time.time()
+    log_path = './data/logs/' + model + '-' + 'training-' + \
+        str(timestamp) + '.log'
+    csv_logger = CSVLogger(log_path)
+
+    # Get the data. Only pass image_shape along when the caller set one, so
+    # the DataSet default applies otherwise.
+    dataset_kwargs = {'seq_length': seq_length, 'class_limit': class_limit}
+    if image_shape is not None:
+        dataset_kwargs['image_shape'] = image_shape
+    data = DataSet(**dataset_kwargs)
+
+    # Get samples per epoch, rounded down to a whole number of batches.
+    # Multiply by 0.7 to attempt to guess how much of data.data is the train set.
+    estimated_train_size = len(data.data) * 0.7
+    samples_per_epoch = (estimated_train_size // batch_size) * batch_size
+
+    # Get generators.
+    generator = data.frame_generator(batch_size, 'train', data_type, concat)
+    val_generator = data.frame_generator(batch_size, 'test', data_type, concat)
+
+    # Get the model.
+    rm = ResearchModels(len(data.classes), model, seq_length, saved_model)
+
+    # Fit!
+    rm.model.fit_generator(
+        generator=generator,
+        samples_per_epoch=samples_per_epoch,
+        nb_epoch=nb_epoch,
+        verbose=1,
+        callbacks=[checkpointer, tb, early_stopper, csv_logger],
+        validation_data=val_generator,
+        nb_val_samples=256)
+
+def main():
+    """These are the main training settings. Set each before running
+    this file."""
+    data_type = 'features'  # can be 'features' or 'images'
+    seq_length = 40
+    model = 'lstm'  # see `models.py` for more
+    class_limit = None  # int, can be 1-101 or None
+    saved_model = None  # None or weights file
+    concat = False  # true for MLP only
+    image_shape = None  # Use None for default or specify a new size
+
+    train(data_type, seq_length, model, saved_model=saved_model,
+          class_limit=class_limit, concat=concat, image_shape=image_shape)
+
+if __name__ == '__main__':
+    main()
--- a/train_cnn.py
+++ b/train_cnn.py
+"""
+Train on images split into directories. This assumes we've split
+our videos into frames and moved them to their respective folders.
+
+Based on:
+https://keras.io/preprocessing/image/
+and
+https://keras.io/applications/
+"""
+from keras.applications.inception_v3 import InceptionV3
+from keras.optimizers import SGD
+from keras.preprocessing.image import ImageDataGenerator
+from keras.models import Model
+from keras.layers import Dense, GlobalAveragePooling2D
+from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
+from data import DataSet
+
+# Shared dataset object; this module only reads its `classes` attribute.
+data = DataSet()
+
+# Helper: Save the model.
+# The checkpoint file name embeds the epoch number and the validation loss.
+checkpointer = ModelCheckpoint(
+    filepath='./data/checkpoints/inception.{epoch:03d}-{val_loss:.2f}.hdf5',
+    verbose=1,
+    save_best_only=True)
+
+# Helper: Stop when we stop learning.
+early_stopper = EarlyStopping(patience=10)
+
+# Helper: TensorBoard
+# Logs are written next to the other training logs in ./data/logs/.
+tensorboard = TensorBoard(log_dir='./data/logs/')
+
+def get_generators():
+    train_datagen = ImageDataGenerator(
+        rescale=1./255,
+        shear_range=0.2,
+        horizontal_flip=True,
+        rotation_range=10.,
+        width_shift_range=0.2,
+        height_shift_range=0.2)
+
+    test_datagen = ImageDataGenerator(rescale=1./255)
+
+    train_generator = train_datagen.flow_from_directory(
+        './data/train/',
+        target_size=(299, 299),
+        batch_size=32,
+        classes=data.classes,
+        class_mode='categorical')
+
+    validation_generator = test_datagen.flow_from_directory(
+        './data/test/',
+        target_size=(299, 299),
+        batch_size=32,
+        classes=data.classes,
+        class_mode='categorical')
+
+    return train_generator, validation_generator
+
+def get_model(weights='imagenet'):
+    # create the base pre-trained model
+    base_model = InceptionV3(weights=weights, include_top=False)
+
+    # add a global spatial average pooling layer
+    x = base_model.output
+    x = GlobalAveragePooling2D()(x)
+    # let's add a fully-connected layer
+    x = Dense(1024, activation='relu')(x)
+    # and a logistic layer -- let's say we have 2 classes
+    predictions = Dense(len(data.classes), activation='softmax')(x)
+
+    # this is the model we will train
+    model = Model(input=base_model.input, output=predictions)
+    return model
+
+def get_top_layer_model(base_model):
+    """Used to train just the top layers of the model."""
+    # first: train only the top layers (which were randomly initialized)
+    # i.e. freeze all convolutional InceptionV3 layers
+    for layer in base_model.layers:
+        layer.trainable = False
+
+    # compile the model (should be done *after* setting layers to non-trainable)
+    base_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
+
+    return base_model
+
+def get_mid_layer_model(model):
+    """After we fine-tune the dense layers, train deeper."""
+    # we chose to train the top 2 inception blocks, i.e. we will freeze
+    # the first 172 layers and unfreeze the rest:
+    for layer in model.layers[:172]:
+        layer.trainable = False
+    for layer in model.layers[172:]:
+        layer.trainable = True
+
+    # we need to recompile the model for these modifications to take effect
+    # we use SGD with a low learning rate
+    model.compile(
+        optimizer=SGD(lr=0.0001, momentum=0.9),
+        loss='categorical_crossentropy',
+        metrics=['accuracy', 'top_k_categorical_accuracy'])
+
+    return model
+
+def train_model(model, nb_epoch, generators, callbacks=[]):
+    train_generator, validation_generator = generators
+    model.fit_generator(
+        train_generator,
+        samples_per_epoch=128,
+        validation_data=validation_generator,
+        nb_val_samples=768,
+        nb_epoch=nb_epoch,
+        callbacks=callbacks)
+    return model
+
+def main(weights_file):
+    model = get_model()
+    generators = get_generators()
+
+    if weights_file is None:
+        print("Loading network from ImageNet weights.")
+        # Get and train the top layers.
+        model = get_top_layer_model(model)
+        model = train_model(model, 10, generators)
+    else:
+        print("Loading saved model: %s." % weights_file)
+        model.load_weights(weights_file)
+
+    # Get and train the mid layers.
+    model = get_mid_layer_model(model)
+    model = train_model(model, 1000, generators,
+                        [checkpointer, early_stopper, tensorboard])
+
+if __name__ == '__main__':
+    weights_file = None
+    main(weights_file)
--- a/validate_cnn.py
+++ b/validate_cnn.py
+"""
+Classify a few images through our CNN.
+"""
+import numpy as np
+import operator
+import random
+import glob
+from data import DataSet
+from processor import process_image
+from keras.models import load_model
+
+def main(nb_images=5):
+    """Spot-check `nb_images` images."""
+    data = DataSet()
+    model = load_model('data/checkpoints/inception.057-1.16.hdf5')
+
+    # Get all our test images.
+    images = glob.glob('./data/test/**/*.jpg')
+
+    for _ in range(nb_images):
+        print('-'*80)
+        # Get a random row.
+        sample = random.randint(0, len(images) - 1)
+        image = images[sample]
+
+        # Turn the image into an array.
+        print(image)
+        image_arr = process_image(image, (299, 299, 3))
+        image_arr = np.expand_dims(image_arr, axis=0)
+
+        # Predict.
+        predictions = model.predict(image_arr)
+
+        # Show how much we think it's each one.
+        label_predictions = {}
+        for i, label in enumerate(data.classes):
+            label_predictions[label] = predictions[0][i]
+
+        sorted_lps = sorted(label_predictions.items(), key=operator.itemgetter(1), reverse=True)
+        
+        for i, class_prediction in enumerate(sorted_lps):
+            # Just get the top five.
+            if i > 4:
+                break
+            print("%s: %.2f" % (class_prediction[0], class_prediction[1]))
+            i += 1
+
+if __name__ == '__main__':
+    main()