# final_doc.py

# UW NetID (Nanda Sundaresan): nandas
# UW NetID (Vineeth Sai Narajala): vineeth7
# CSE 160
# Homework 7: Final project

import csv
import os
from operator import itemgetter

import matplotlib.pyplot as plt
import pylab
import pandas as pd
import seaborn as sns

# Set the default figure size used for the plots drawn by this script.
plt.rcParams["figure.figsize"] = (16, 8)
# NOTE(review): pylab presumably shares matplotlib's global rcParams with pyplot,
# which would make this second assignment redundant -- confirm before removing.
pylab.rcParams["figure.figsize"] = (16, 8)


def extract_as_list(filename):
    """Reads a csv file and returns its rows as a list of dictionaries.
    Parameters:
        filename: filename as string.
    Returns:
        output: list with one dictionary per data row of the csv file, keyed by
        the column names taken from the file's header line.
    """

    # The with-block closes the file even when reading fails part-way through.
    with open(filename) as csv_file:
        output = list(csv.DictReader(csv_file))
    return output


def extract_as_dataframe(filename):
    """Loads the whole csv file into a pandas DataFrame.
    Parameters:
        filename: filename as string.
    Returns:
        DataFrame holding the contents of the csv file.
    """

    # low_memory=False is passed straight through to pandas' csv reader.
    return pd.read_csv(filename, low_memory=False)


def extract_data_for_months_by_year(data_list):
    """Extracts incidence per month for every year as a dictionary of dictionaries.
    For example, data points for the months of year 1980 would look something like this:
        {"1980": {"January" : 18983, "February" : 23213, "March" : 31242 ... etc.}}
    Years and months are kept exactly as they appear in the rows (strings when the rows
    come from csv.DictReader).
    Parameters:
        data_list: list of dictionaries, each item in list is a row of the CSV file.
        Each row must provide the keys "Year", "Month" and "Incident".
    Returns:
        output_dict: dictionary mapping year to incidences per month of that year.
        An empty data_list gives an empty dictionary.
    Raises:
        AssertionError: if any year does not have exactly 12 distinct months.
    """

    output_dict = dict()

    for row in data_list:
        # setdefault creates the year's dictionary the first time the year is seen,
        # so the row that introduces a new year is counted like every other row.
        year_dict = output_dict.setdefault(row["Year"], dict())
        year_dict[row["Month"]] = year_dict.get(row["Month"], 0) + int(row["Incident"])

    # Validate every year (not just the last row's year); raised explicitly so the
    # check still runs when Python is started with -O.
    for year_dict in output_dict.values():
        if len(year_dict) != 12:
            raise AssertionError("Invalid number of months in data file.")

    return output_dict


def extract_data_by_categories(data_list, column_names, exclude_points):
    """ Collects the valid values of each requested column.
    Parameters:
        data_list: list of dictionaries, each item in list is a row of the CSV file.
        column_names: list of column names as strings that you want to investigate.
        exclude_points: list of values as strings that are invalid data points such as "Unknown".
    Returns:
        grouped: dictionary mapping each column name to the list of its valid values, in row
        order. A column that never has a valid value does not appear in the dictionary.
    """

    grouped = {}
    for record in data_list:
        for column in column_names:
            value = record[column]
            # Skip invalid or "placeholder" values.
            if value in exclude_points:
                continue
            grouped.setdefault(column, []).append(value)

    return grouped


def find_mode_of_category(data_dict, column_name):
    """ Finds the most common value (mode) of one column together with its frequency.
    Parameters:
        data_dict: dictionary of selected column names as keys with their data points as values.
        column_name: specific column name to find mode of.
    Returns:
        tuple consisting of: (value that is the mode, fraction of the column's data points
        that have that value).
    """

    # Tally how often each value occurs in the column.
    tally = dict()
    for point in data_dict[column_name]:
        tally[point] = tally.get(point, 0) + 1

    total_points = sum(tally.values())

    # Rank values from most to least common; the sort is stable, so tied values keep
    # the order in which the tally yields them.
    ranked = list(tally.items())
    ranked.sort(key = itemgetter(1), reverse = True)
    top_entry = ranked[0]

    # Express the winning count as a fraction of all data points.
    return (top_entry[0], float(top_entry[1]) / total_points)


def find_max_of_all(data_dict, column_names):
    """ Finds the mode and its frequency for each of the given columns.
    Parameters:
        data_dict: dictionary of selected column names as keys with their data points as values.
        column_names: list of column names as strings that you want to investigate.
    Returns:
        list with one find_mode_of_category result per column, in the order of column_names.
    """

    return [find_mode_of_category(data_dict, column) for column in column_names]


def print_max_values(all_maxes, column_names):
    """Prints one "Most affected ..." line per column.
    Parameters:
        all_maxes: list returned from find_max_of_all, list of tuples as (mode datum, frequency).
        column_names: list of column names as strings that you want to investigate.
    Returns:
        None, prints the mode of each column and its frequency as a percentage.
    """

    for position, category in enumerate(column_names):
        entry = all_maxes[position]
        mode_value = entry[0]
        # Frequency is stored as a fraction; show it as a percentage with two decimals.
        percent = format(float(entry[1]) * 100, '.2f')
        line = "Most affected %s: %s, %s" % (category, mode_value, percent) + "%"
        print(line)


def extract_data_by_state(data_list, column_names, exclude_points, state_name):
    """Computes the modes of the chosen columns using only the rows of one state.
    Parameters:
        data_list: list of dictionaries, each item in list is a row of the CSV file.
        column_names: list of column names as strings that you want to investigate.
        exclude_points: list of values as strings that are invalid data points such as "Unknown".
        state_name: state name as string; only rows whose "State" equals it are used.
    Returns:
        list of tuples as returned from find_max_of_all.
    """

    # Keep only the rows that belong to the requested state.
    rows_for_state = [record for record in data_list if record["State"] == state_name]

    valid_points = extract_data_by_categories(rows_for_state, column_names, exclude_points)
    return find_max_of_all(valid_points, column_names)


def print_state_data(data_list, column_names, exclude_points, states):
    """Prints the modes of the chosen columns for each requested state.
    Parameters:
        data_list: list of dictionaries, each item in list is a row of the CSV file.
        column_names: list of column names as strings that you want to investigate.
        exclude_points: list of values as strings that are invalid data points such as "Unknown".
        states: list of states to calculate modes for.
    Returns:
        None, prints a heading, the modes and a blank line for each state.
    """

    for state_name in states:
        modes = extract_data_by_state(data_list, column_names, exclude_points, state_name)
        heading = "For the state of %s:" % (state_name)
        print(heading)
        print_max_values(modes, column_names)
        # Blank line separating one state's report from the next.
        print("")


def get_user_states(dict_states):
    """Prompts the user for two state names and checks that both are known.
    Parameters:
        dict_states: dictionary whose keys are the accepted state names.
    Returns:
        state_list: list of the two state names the user typed, in the order asked.
    """

    print("Please capitalize the first letter of the state name.")
    questions = ('What is the first state you would like to look at? ',
                 'What is the second state you would like to look at? ')
    # Ask both questions, in order, before validating anything.
    state_list = [str(raw_input(question)) for question in questions]
    print("")

    for chosen in state_list:
        assert chosen in dict_states, "Please type a valid state name!"

    return state_list


def spike_check_visual(year_data):
    """Orders months in chronological order and checks for spikes in total incidents per month.
    A spike is a month whose total is at least 1.5 times the total of the month before it,
    within the same year. Every year from 1980 to 2014 that contains a spike is graphed
    with graph_spike_year, and the three spikes with the largest absolute increase are printed.
    Parameters:
        year_data: dictionary of dictionaries, each year (as a string) mapping to dictionary
        of months mapping to total incidents.
    Returns:
        None, prints statements.
    """

    # Chronological month order so the x-axis of the graphs does not display randomly.
    month_order = ["January", "February", "March", "April", "May", "June", "July", "August",
                   "September", "October", "November", "December"]

    # (increase, description) pairs. A list is used instead of a dictionary keyed by the
    # increase so that two spikes of the same size do not overwrite each other.
    spikes = list()

    for input_y in range(1980, 2015):
        input_year = str(input_y)
        monthly_totals = year_data[input_year]

        # A month missing from the data leaves None in both lists.
        x_val = [name if name in monthly_totals else None for name in month_order]
        y_val = [monthly_totals.get(name) for name in month_order]

        year_has_spike = False
        for i in range(11):
            if (y_val[i] * 1.5) <= (y_val[i + 1]):
                year_has_spike = True
                diff = y_val[i + 1] - y_val[i]
                spikes.append((diff, "%s to %s %s" % (x_val[i], x_val[i + 1], input_year)))

        # Graphing only the years in which there was a spike, for visual confirmation.
        if year_has_spike:
            graph_spike_year(x_val, y_val, input_year)

    # Largest increase first; the sort is stable, so equal increases stay chronological.
    spikes.sort(key = itemgetter(0), reverse = True)

    print("Three largest spikes in total monthly incidents from 1980 to 2014:")
    for spike in spikes[0:3]:
        print(spike[1])
    print("")


def graph_spike_year(x_val, y_val, input_year):
    """Graphs the monthly incident totals of one year that has a spike.
    Parameters:
        x_val: list of months, used as the x-axis tick labels.
        y_val: list of total incidents for the corresponding month.
        input_year: string that is the year which we are to graph.
    Returns:
        None, saves graph as incidents_<year>.png inside the "yearly" folder.
    """

    # Build the path with os.path.join instead of a hard-coded backslash so the file
    # lands inside the folder on every platform; create the folder if it is missing.
    output_dir = "yearly"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    pylab.figure(1)
    x = range(12)
    pylab.xticks(x, x_val)
    pylab.plot(x, y_val, "g")
    pylab.title("Number of Incidents per Month for " + input_year)
    pylab.ylabel("Number of Incidents")
    pylab.xlabel("Months")
    pylab.savefig(os.path.join(output_dir, "incidents_" + str(input_year) + ".png"))
    # Clear the figure so the next graph starts from an empty canvas.
    pylab.clf()


def graph_affected_ages(age_clean_data):
    """Graphs bar plot showing number of incidents for different victim ages.
    Parameters:
        age_clean_data: DataFrame with a "Victim Age" column; the caller is expected to
        have removed placeholder ages beforehand.
    Returns:
        None, saves graph as victim_age.png inside the "graphs" folder.
    """

    # Build the path with os.path.join instead of a hard-coded backslash so the file
    # lands inside the folder on every platform; create the folder if it is missing.
    output_dir = "graphs"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Count the rows per age and order the bars by age rather than by count.
    age_counts = age_clean_data["Victim Age"].value_counts().sort_index(ascending = True)
    age_counts.plot(kind = "bar", color = "purple")
    plt.title("Number of Incidents for Victim Ages")
    plt.xlabel("Ages")
    plt.ylabel("Number of Incidents")
    plt.savefig(os.path.join(output_dir, "victim_age.png"))
    plt.clf()


def graph_affected_sexes(data_frame):
    """Graphs bar plot showing number of incidents for different victim sexes.
    Parameters:
        data_frame: DataFrame of csv file; must have a "Victim Sex" column.
    Returns:
        None, saves graph as number_hom_sex.png inside the "graphs" folder.
    """

    # Build the path with os.path.join instead of a hard-coded backslash so the file
    # lands inside the folder on every platform; create the folder if it is missing.
    output_dir = "graphs"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    data_frame["Victim Sex"].value_counts().plot(kind = 'bar')
    plt.title("Number of Incidents for Victim Sexes")
    plt.ylabel("Number of Incidents")
    plt.savefig(os.path.join(output_dir, "number_hom_sex.png"))
    plt.clf()

def graph_unsolved_cases_per_year(data_frame):
    """Graphs line plot showing number of unsolved cases per year.
    A case counts as unsolved when its "Crime Solved" value is anything other than "Yes".
    Parameters:
        data_frame: DataFrame of csv file; must have "Crime Solved" and "Year" columns.
    Returns:
        None, saves graph as unsolved_hom.png inside the "graphs" folder.
    """

    # Build the path with os.path.join: the old literal relied on a backslash followed
    # by "u" being left alone, which only Python 2 byte strings allow, and only made a
    # folder path on Windows. Create the folder if it is missing.
    output_dir = "graphs"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    unsolved = data_frame[data_frame["Crime Solved"] != "Yes"]
    # Count the unsolved rows per year and plot them in chronological order.
    unsolved['Year'].value_counts().sort_index(ascending = True).plot(kind = 'line', color = "Red")
    plt.title('Number of Unsolved Homicides: 1980 to 2014')
    plt.savefig(os.path.join(output_dir, "unsolved_hom.png"))
    plt.clf()


def graph_weapons_handgun_over_time(data_frame):
    """Graphs number of cases per year with weapon documented as "Handgun".
    Parameters:
        data_frame: DataFrame of csv file; must have "Weapon" and "Year" columns.
    Returns:
        None, saves graph as handgun_time.png inside the "graphs" folder.
    """

    # Build the path with os.path.join instead of a hard-coded backslash so the file
    # lands inside the folder on every platform; create the folder if it is missing.
    output_dir = "graphs"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Only the rows whose weapon is exactly "Handgun" are counted.
    handgun_rows = data_frame[data_frame["Weapon"] == "Handgun"]
    ax2 = sns.countplot(x = "Year", hue = "Weapon", data = handgun_rows,
                        palette = "colorblind")
    ax2.legend(loc='upper right')
    plt.title("Use of Handguns over Time")
    plt.xlabel("Years")
    plt.ylabel("Number of Homicides using Handguns")
    plt.savefig(os.path.join(output_dir, "handgun_time.png"))
    plt.clf()


def find_incidents_for_states(data_list, dict_states):
    """Adds up the "Incident" values of the rows of each state.
    Parameters:
        data_list: list of dictionaries, each item in list is a row of the CSV file.
        dict_states: dictionary mapping each state name to its abbreviation.
    Returns:
        totals: dictionary mapping each state abbreviation to its summed incident values.
    """

    totals = {}

    for record in data_list:
        # Totals are keyed by abbreviation, not by the full state name.
        abbreviation = dict_states[record["State"]]
        totals[abbreviation] = totals.get(abbreviation, 0) + int(record["Incident"])

    return totals
    
    
def print_high_and_low_incidence(final_dict):
    """Prints the states with the highest and the lowest total incidents.
    Parameters:
        final_dict: dictionary mapping each state to its total incident count.
    Returns:
        None, prints.
    """

    # Rank the (state, total) pairs from the largest total to the smallest.
    ranking = list(final_dict.items())
    ranking.sort(key = itemgetter(1), reverse = True)

    print("States with highest and lowest total incidents:")
    highest = ranking[0]
    print("Highest: %s, Total Incidents: %d" % (highest[0], highest[1]))
    lowest = ranking[-1]
    print("Lowest: %s, Total Incidents: %d" % (lowest[0], lowest[1]))
    print("")

def graph_crime_state(state_dict):
    """Graphs bar plot of total incidents for each state.
    Parameters:
        state_dict: dictionary mapping each state label (used as the x-axis tick label)
        to its total number of incidents.
    Returns:
        None, saves graph as incidents_state.png inside the "graphs" folder.
    """

    # Build the path with os.path.join instead of a hard-coded backslash so the file
    # lands inside the folder on every platform; create the folder if it is missing.
    output_dir = "graphs"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Split the dictionary into parallel lists of labels and bar heights.
    x_val = list()
    y_val = list()
    for label, total in state_dict.items():
        x_val.append(label)
        y_val.append(total)

    x = range(len(x_val))
    pylab.xticks(x, x_val)
    # Rotate the labels so they do not overlap.
    plt.xticks(rotation = 90)
    pylab.bar(x, y_val, color='r')
    pylab.title("Number of Incidents for all States")
    pylab.ylabel("Number of Incidents")
    pylab.xlabel("state")
    pylab.savefig(os.path.join(output_dir, "incidents_state.png"))
    pylab.clf()


def main():
    """Runs the whole analysis when final_doc.py is run.
    Reads "crime_data.csv", optionally prints the modes for two states chosen by the user,
    prints the modes for all states, and then prints and graphs the remaining results.
    Parameters:
        None.
    Returns:
        None, prints results and saves graphs.
    Raises:
        AssertionError: if the user answers the yes/no question with anything else.
    """

    print("Welcome to Nanda and Vineeth's data analysis for all US homicides from 1980 - 2014")
    print("Our data set has more than 600,000 data points, so please bear with us if the program is slow\n")

    filename = "crime_data.csv"
    column_names = ["Victim Sex", "Victim Age", "Victim Race", "Relationship"]
    exclude_points = ["Unknown", "0", "998"]

    # NOTE(review): "Rhodes Island" presumably mirrors the spelling used in the data
    # file -- verify against the csv before correcting it.
    dict_states = {
        "Alaska": "AK", "Alabama": "AL", "Arkansas": "AR", "Arizona": "AZ", "California": "CA",
        "Colorado": "CO","Connecticut": "CT", "District of Columbia": "DC", "Delaware": "DE",
        "Florida": "FL", "Georgia": "GA","Hawaii": "HI", "Iowa": "IA", "Idaho": "ID", "Illinois": "IL",
        "Indiana": "IN", "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Massachusetts": "MA",
        "Maryland": "MD", "Maine": "ME", "Michigan": "MI", "Minnesota": "MN", "Missouri": "MO",
        "Mississippi": "MS", "Montana": "MT", "North Carolina": "NC", "North Dakota": "ND",
        "Nebraska": "NE", "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM","Nevada": "NV",
        "New York": "NY", "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA",
        "Puerto Rico": "PR", "Rhodes Island": "RI", "South Carolina": "SC", "South Dakota": "SD",
        "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Virginia": "VA", "Vermont": "VT",
        "Washington": "WA", "Wisconsin": "WI", "West Virginia": "WV", "Wyoming": "WY"
    }

    # Extract data from CSV file
    data_list = extract_as_list(filename)

    # Find all data points for victim sex, race, age, and relationship
    data_dict = extract_data_by_categories(data_list, column_names, exclude_points)

    # Compare two state modes based on user input. The answer is normalized before it is
    # validated, so "Yes" or " no " are accepted as well as the exact lowercase words.
    answer = str(raw_input("Would you like to compare modes for two states? (yes/no) "))
    answer = answer.strip().lower()
    assert (answer == "yes" or answer == "no"), "Please type a valid answer!"

    print("")
    if answer == "yes":
        states = get_user_states(dict_states)
        print_state_data(data_list, column_names, exclude_points, states)
    else:
        print("Please wait for the remaining computations!\n")

    # Find most common datum for victim sex, race, age, and relationship
    all_maxes = find_max_of_all(data_dict, column_names)
    print("For all states:")
    print_max_values(all_maxes, column_names)
    print("")

    print("Please wait, the program is graphing the results\n")

    print("Plotting years with spikes...\n")
    # Find incidence rate per month for every year
    year_data = extract_data_for_months_by_year(data_list)
    spike_check_visual(year_data)

    # Extract data from CSV file as DataFrame
    data_frame = extract_as_dataframe(filename)

    print("Plotting total incidents for all states...")
    # Graph all incidents for all states
    incidents_dict = find_incidents_for_states(data_list, dict_states)
    graph_crime_state(incidents_dict)
    print("Done\n")

    print_high_and_low_incidence(incidents_dict)

    print("Plotting affected victim ages...")
    # Graph affected victim ages, leaving out the rows whose age is 998 or 99
    age_clean_data = data_frame[data_frame["Victim Age"] != 998]
    age_clean_data = age_clean_data[age_clean_data["Victim Age"] != 99]
    graph_affected_ages(age_clean_data)
    print("Done\n")

    print("Plotting use of handguns over time...")
    # Graph use of handguns over time
    graph_weapons_handgun_over_time(data_frame)
    print("Done\n")

    print("Plotting affected victim sex based on frequency...")
    # Graph affected victim sex based on frequency
    graph_affected_sexes(data_frame)
    print("Done\n")

    print("Plotting number of unsolved cases...")
    # Graph number of unsolved cases from 1980 to 2014
    graph_unsolved_cases_per_year(data_frame)
    print("Done\n")

    print("Check the folder 'yearly' and 'graphs' for the plots\n")
    print("Program complete.")
    

# Run the analysis only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()