Source code for datatools.nbddataframe

import pandas as pd
from sqlalchemy import create_engine, MetaData
from StringIO import StringIO
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

pd.options.display.mpl_style = 'default'


#data string for testing purposes
testdata=\
"""\
val	lat	lon	rand	neighborhood	date
4.0764441201	47.600025185	-122.373927686	0.1276587849	B	1/1/2000
4.2520509753	47.4	-122.3413044916	0.8755922036	A	6/20/1986
1.3226973685	47.634555916	-122.3442923659	0.3654616242	B	10/13/1987
-9.3627561629	47.6509594572	-122.2993523445	0.0957500809	B	1/1/2000
	47.5445857933	-122.3436249441	0.1716343011	B	1/1/2000
-5.911734812	47.5112812526		0.0838911855	B	1/1/2000
4.7337248223	47.6178126252	-122.3997977501	0.9184924145	A	6/20/1986
-9.5657038502	47.5267191329	-122.1	0.2555611331	B	1/1/2000
-6.5947027411	47.72339967	-122.3703101539	0.3693092365	B	10/13/1987
3.5680261068	47.7431436405	-122.3081758471	0.6814667757	A	6/20/1986
-0.4510013713	47.7208316866	-122.2663515676	0.9224957069	A	6/20/1986
	47.5186485248	-122.3297974329	0.4966851496	B	10/13/1987
-6.0590357706	47.5794698747	-122.4396772129	0.5864697497	A	10/13/1987
"""

#lat long bounds
minlat = 47.5
"""The default minimum latitude."""

maxlat = 47.75
"""The default maximum latitude."""

minlong = -122.44
"""The default minimum longitude."""

maxlong = -122.2
"""The default maximum longitude."""

[docs]def rename_cols(df, nbdname=None, latname='latitude',
                longname='longitude', datename='date'):
    """
    Rename the longitude, latitude, date (and convert to time type)
    and nbd (if there is one) columns to make the DataFrame *df* 
    compatible with :class:`NBDDataFrame`.
    
    Parameters:
    ___________
    
    :param str nbdname: 
        (optional) the name of the neighborhood column
        if there is one, default is None
    
    :param str latname: 
        the name of the latitude column, default is 'latitude'
    
    :param str longname: 
        the name of the longitude column, default is 'longitude'
    
    :param str datename: 
        the name of the date column, default is 'date'
        
    Returns:
    ________
    
    :return: a pandas.DataFrame compatible with :class:`NBDDataFrame`
    :rtype: pandas.DataFrame

    """     
    
    newdf = df.rename(columns={latname:'latitude', longname:'longitude',
                       datename:'date'})
    if nbdname:
        newdf.rename(columns={nbdname:'nbd'}, inplace=True)
    
    newdf['date'] = pd.to_datetime(newdf['date'])    
           
    return newdf
    


[docs]def get_csv_data(filename, nbdname=None, latname='latitude',
                 longname='longitude', datename='date', sep='\t'):
    """
    Read neighborhood data from a csv into a pandas.DataFrame
    compatible with :class:`NBDDataFrame`. The csv data should have at
    the least latitude and longitude columns with names *latname* and
    *longname*, and a date column with name *datename*. A neighborhood
    column with name *nbdname* is optional.
    
    Parameters:
    ___________
    
    :param str filename: 
        the name of the csv file
        
    :param str nbdname: 
        (optional) the name of the neighborhood column
        if there is one, default is None
    
    :param str latname: 
        the name of the latitude column, default is 'latitude'
    
    :param str longname: 
        the name of the longitude column, default is 'longitude'
    
    :param str datename: 
        the name of the date column, default is 'date'
    
    :param str sep: 
        the separation character, default is tab
    
    Returns:
    ________
    
    :return: a pandas.DataFrame compatible with :class:`NBDDataFrame`
    :rtype: pandas.DataFrame
    
    For example,
    
    >>> from datatools.nbddataframe import testdata, get_csv_data
    >>> fil = open('test.csv', 'w') #make a test csv file
    >>> fil.write(testdata) #write data randomly generated for test purposes 
    >>> fil.close()
    >>> df = get_csv_data(
    ...                   filename='test.csv', nbdname='neighborhood',
    ...                   latname='lat', longname='lon', datename='date', 
    ...                   sep='\t'
    ... )
    >>> df.head()
            val   latitude   longitude      rand nbd       date
    0  4.076444  47.600025 -122.373928  0.127659   B 2000-01-01
    1  4.252051  47.400000 -122.341304  0.875592   A 1986-06-20
    2  1.322697  47.634556 -122.344292  0.365462   B 1987-10-13
    3 -9.362756  47.650959 -122.299352  0.095750   B 2000-01-01
    4       NaN  47.544586 -122.343625  0.171634   B 2000-01-01
    >>> df.loc[0, 'nbd']
    'B'
    >>> df.mean().loc['latitude']
    47.597802519907695
        
    """

    df = pd.read_csv(filename, sep='\t', parse_dates=[datename])
    df.rename(columns={latname:'latitude', longname:'longitude',
                       datename:'date'}, inplace=True)
    if nbdname:
        df.rename(columns={nbdname:'nbd'}, inplace=True)
    
    return df
    
    
#test dataframe
testdataframe = get_csv_data(
                  filename=StringIO(testdata), nbdname='neighborhood',
                  latname='lat', longname='lon', datename='date', sep='\t'
)


[docs]def get_db_data(engine, tablename='nbddata', nbdname=None, 
                latname='latitude', longname='longitude', datename='date',
                index_col=None):
    """
    Read neighborhood data from a database into a pandas.DataFrame 
    compatible with :class:`NBDDataFrame`. The table should have at the least 
    latitude and longitude columns with names *latname* and *longname*, and a 
    date column with name *datename*. A neighborhood column with name 
    *nbdname* is optional.
    
    Parameters:
    ___________
    
    :param sqlalchemy.engine.Engine engine: 
        the database engine
        
    :param str tablename: 
        the name of the database table, default is 'nbddata'
        
    :param str nbdname: 
        (optional) the name of the neighborhood column if
        there is one, default is None
        
    :param str latname: 
        the name of the latitude column, default is 'latitude'
        
    :param str longname: 
        the name of the longitude column, default is 'longitude'
        
    :param str datename: 
        the name of the date column, default is 'date'
        
    :param str index_col:
        the name of the index column if there is one, default is None 
    
    Returns:
    ________
    
    :return: a DataFrame compatible with :class:`NBDDataFrame`
    :rtype: pandas.DataFrame
    
    For example,

    >>> from datatools.nbddataframe import testdataframe, NBDDataFrame
    >>> from datatools.nbddataframe import make_db, get_db_data
    >>> neigh_dataframe = NBDDataFrame(testdataframe)
    >>> engine = make_db(nbddf=neigh_dataframe, tablename='neigh_data')
    >>> df2 = get_db_data(engine=engine, tablename='neigh_data', 
    ...                   nbdname='nbd', latname='latitude', 
    ...                   longname='longitude', datename='date'
    ... )
    >>> df2.loc[1, 'nbd']
    u'A'
    
    """
        
    df = pd.read_sql_table(table_name=tablename, con=engine,
                           parse_dates=[datename], index_col=index_col)
    colrndict = {latname:'latitude', longname:'longitude', datename:'date'}
    df.rename(columns=colrndict, inplace=True)
    if nbdname:
        df.rename(columns={nbdname:'nbd'}, inplace=True)
    
    return df
    
    
[docs]def make_db(nbddf, dbname=None, tablename='nbddata'):
    """
    Write :class:`NBDDataFrame` data into a SQLite database.
    
    Parameters:
    ___________
    
    :param NBDDataFrame nbddf: 
        the :class:`NBDDataFrame` object containing the data
        
    :param str dbname: 
        the name of the database; if no name is given, 
        the database will be in-memory-only, default is None
        
    :param str tablename: 
        the name of the table, default is 'nbddata'
    
    Returns:
    ________
    
    :returns: an engine to the database
    :rtype: sqlalchemy.engine.Engine
    
    For example,

    >>> from datatools.nbddataframe import testdataframe, NBDDataFrame
    >>> from datatools.nbddataframe import make_db
    >>> neigh_dataframe = NBDDataFrame(testdataframe)
    >>> engine = make_db(nbddf=neigh_dataframe, tablename='neigh_data')
    >>> con = engine.connect()
    >>> res = con.execute("select latitude from neigh_data where nbd = 'A'")
    >>> data = res.fetchall()
    >>> data[0][0]
    47.4
    >>> con.close()
    
    """
    #make the engine
    if dbname is None:
        engine = create_engine('sqlite://')
    else:
        engine = create_engine('sqlite:///{}.db'.format(dbname))
    
    nbddf.get_df().to_sql(name=tablename, con=engine)
    return engine    
       

[docs]class NBDDataFrame(object):
    """
    A neighborhood data cleaner and preliminary analyzer.
    
    Parameters:
    ___________
    
    :param pandas.DataFrame df: 
        the neighborhood data: at the least,
        it should have *latitude*, *longitude* and *date* columns
        
    :param float min_lat:
        the minimum considered latitude, default is :attr:`minlat`

    :param float max_lat:
        the maximum considered latitude, default is :attr:`maxlat`

    :param float min_long:
        the minimum considered longitude, default is :attr:`minlong`

    :param float max_long:
        the maximum considered longitude, default is :attr:`maxlong`
        
    :param bool debug:
        if True, produce verbose output; default is False
        
    Raises:
    _______
    
    :raises Exception: if *df* does not have the required columns
    
    For example, we read a test DataFrame into an :class:`NBDDataFrame`.
        
    >>> from datatools.nbddataframe import testdataframe, NBDDataFrame
    >>> nbddf = NBDDataFrame(testdataframe, debug=True)
    The DataFrame is in the correct format
    
    An exception is raised if the DataFrame is not in the correct format
    (in this case, a *date* column is missing). 
    
    >>> nbddf = NBDDataFrame(testdataframe[['latitude', 'longitude']])
    Traceback (most recent call last):
        ...
    Exception: DataFrame format error

    To print missing data info,
    
    >>> nbddf = NBDDataFrame(testdataframe)
    >>> print nbddf.print_info()
    The number of rows is 13.
    2 rows are missing val.
    1 rows are missing longitude.
    2 rows have out-of-bounds location.
    
    To remove rows with missing location or date data,
     
    >>> nbddf.remove_missing_data()
    >>> print nbddf.print_info()
    The number of rows is 12.
    2 rows are missing val.
    2 rows have out-of-bounds location.
    
    To remove rows with out-of-bounds locations,
    
    >>> nbddf.remove_outofbounds_data()
    >>> print nbddf.print_info()
    The number of rows is 10.
    2 rows are missing val.
    
    Preliminary plots:
    
    >>> nbddf.plot_rowcount_by_month()
    >>> nbddf.plot_map()

    
    .. todo::
       * function to add neighborhoods
       * cleaning functions
       * grouping functions
       * vis/analysis functions
       * normalization functions
       * Use shapefiles to get nbd instead of nearest-nbr predictor.          
    
    """
    
    def __init__(self, df, min_lat=minlat, max_lat=maxlat, min_long=minlong, 
                 max_long=maxlong, debug=False):

        self.df = df
        
        required_cloumns = set(['latitude', 'longitude', 'date'])
        if not required_cloumns.issubset(df.columns):
            raise Exception('DataFrame format error')
        elif debug:
            print "The DataFrame is in the correct format"
            
        self.min_lat = min_lat
        self.max_lat = max_lat
        self.min_long = min_long
        self.max_long = max_long

        self.seattlemap = None            
            
    
[docs]    def print_info(self):
        """
        Print the number of rows, number of missing values for each column,
        and number of rows with location out of bounds.
        
        Returns:
        ________
        
        :returns: the above information
        :rtype: str
        
        """
        df = self.get_df()
        
        count = df.count()
        num_rows = len(df)
        printstr = "The number of rows is {}.\n".format(num_rows)
        missing_count = num_rows - count
        for col in missing_count.index:
            if missing_count[col] > 0:
                missstr = "{num} rows are missing {colu}.\n"
                printstr += missstr.format(num=missing_count[col], colu=col)

        out_of_bounds = len(df[(df.latitude < self.min_lat) | 
                               (df.latitude > self.max_lat) |
                               (df.longitude < self.min_long) | 
                               (df.longitude > self.max_long)]
        )

        if out_of_bounds > 0:
            ofbstr = "{num} rows have out-of-bounds location.\n"
            printstr += ofbstr.format(num=out_of_bounds)                
                
        return printstr[:-1]
                
[docs]    def remove_missing_data(self):
        """
        Remove rows with missing location or date data.
        
        """
        
        self.df.replace({'latitude' : 0, 'longitude' : 0}, None)
        self.df = self.df[(self.df.date.notnull()) & 
                          (self.df.latitude.notnull()) & 
                          (self.df.longitude.notnull())
        ]
        
[docs]    def remove_outofbounds_data(self):
        """
        Remove rows with out-of-bounds locations.
        
        """
        
        self.df = self.df[(self.df.latitude >= self.min_lat) & 
                          (self.df.latitude <= self.max_lat) & 
                          (self.df.longitude >= self.min_long) & 
                          (self.df.longitude <= self.max_long)
        ]
            
   
[docs]    def plot_rowcount_by_month(self, df=None,
                               filename="rowcount_by_month.png"):
        """
        Plot the number of rows by month.
        
        Parameters:
        ___________
        
        :param DataFrame df:
            the data to plot: if None, 
            then this object's underlying DataFrame is used, 
            default is None
        
        :param str filename: 
            the name of the file with the plot, 
            default is "rowcount_by_month.png"
        
        """
        
        if df is None:
            df = self.get_df()
            
        ax = plt.subplot(111)
        numrowsbydate = df[['date']].groupby(df.date).count()
        resamplebymonth = numrowsbydate.resample("M", how="sum")
        ax.plot(resamplebymonth.index, resamplebymonth)
        ax.set_title("Row count by month")
        plt.savefig(filename, dpi=200)
        plt.clf()
        
        
[docs]    def plot_map(self, df=None, filename="row_locations_map.png"):
        """
        Plot the number of rows by month.
        
        Parameters:
        ___________
        
        :param DataFrame df:
            the data to plot: if None, 
            then this object's underlying DataFrame is used, 
            default is None
        
        :param str filename: 
            the name of the file with the plot, 
            default is "rowcount_by_month.png"
        
        """

        if df is None:
            df = self.get_df()
        
        if self.seattlemap is None:    
            self.setup_map()    
        
        locax = plt.subplot(111)                              
        self.seattlemap.drawcoastlines(ax=locax)
        locax.scatter(df.longitude, df.latitude, s=8, marker='.')
        locax.set_title("Locations")
        plt.savefig(filename, dpi=200)
        plt.clf()
                     
                
    def setup_map(self):
    
        self.seattlemap = Basemap(llcrnrlon = self.min_long, 
                                  llcrnrlat = self.min_lat, 
                                  urcrnrlon = self.max_long, 
                                  urcrnrlat = self.max_lat, 
                                  resolution='i')
                
[docs]    def get_df(self):
        """
        Get the underlying DataFrame.
        
        Returns:
        ________
        
        :returns: the underlying DataFrame
        :rtype: pandas.DataFrame
        
        """
        
        return self.df