Imports

In [2]:
#Do the imports at the start of the script
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
import sys
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import numpy as np
/Users/rohitghosh/anaconda/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

Static Content for One Time Use

In [2]:
#Read the csv data files
trip = pd.read_csv('trip_data_4.csv')
fare = pd.read_csv('trip_fare_4.csv')
/home/rohit/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [1]:
#Capture the date of pickup from the DateTimeStamp of pickup_time column
trip['pickup_day'] = pd.DatetimeIndex(trip[' pickup_datetime']).day
trip.to_pickle('trip_pickup_day.pkl')
/home/rohit/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (4) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Code snippets for plotting the variable distributions

In [6]:
print (len(fare))
fare.head()
15100468
Out[6]:
medallion hack_license vendor_id pickup_datetime payment_type fare_amount surcharge mta_tax tip_amount tolls_amount total_amount
0 91F6EB84975BBC867E32CB113C7C2CD5 AD8751110E6292079EB10EB9481FE1A6 CMT 2013-04-04 18:47:45 CRD 11.0 1.0 0.5 2.50 0.0 15.00
1 EC34CD1B3797DFAFF3FE099BA87B6656 8FE6A4AEDF89B6B4E19D2377FD3FB7D7 CMT 2013-04-05 07:08:34 CRD 8.5 0.0 0.5 1.80 0.0 10.80
2 C1B9DA774DC2BBC6DE27CE994E7F44A0 E1B595FD55E4C82C1E213EB17438107A CMT 2013-04-04 17:59:50 CRD 16.5 1.0 0.5 3.60 0.0 21.60
3 9BA84250355AB3FC031C9252D395BF8A 16BB0D96A0DCC853AEC7F55C8D6C71E0 CMT 2013-04-04 18:12:01 CRD 10.0 1.0 0.5 3.45 0.0 14.95
4 205A696DF62AD03C88DA8C5EC5248639 579C41EA5EC846F8B641A42F9EE3E855 CMT 2013-04-04 20:12:57 CRD 15.0 0.5 0.5 3.20 0.0 19.20
In [7]:
print (len(trip))
trip.head()
15100468
Out[7]:
medallion hack_license vendor_id rate_code store_and_fwd_flag pickup_datetime dropoff_datetime passenger_count trip_time_in_secs trip_distance pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
0 91F6EB84975BBC867E32CB113C7C2CD5 AD8751110E6292079EB10EB9481FE1A6 CMT 1 N 2013-04-04 18:47:45 2013-04-04 19:00:25 1 759 2.5 -73.957855 40.765320 -73.976273 40.785648
1 EC34CD1B3797DFAFF3FE099BA87B6656 8FE6A4AEDF89B6B4E19D2377FD3FB7D7 CMT 1 N 2013-04-05 07:08:34 2013-04-05 07:17:34 1 540 1.6 0.000000 0.000000 0.000000 0.000000
2 C1B9DA774DC2BBC6DE27CE994E7F44A0 E1B595FD55E4C82C1E213EB17438107A CMT 1 N 2013-04-04 17:59:50 2013-04-04 18:21:48 1 1318 3.6 -73.982880 40.754990 -74.009186 40.715374
3 9BA84250355AB3FC031C9252D395BF8A 16BB0D96A0DCC853AEC7F55C8D6C71E0 CMT 1 N 2013-04-04 18:12:01 2013-04-04 18:25:24 1 799 1.9 -73.978119 40.763451 -73.955666 40.776642
4 205A696DF62AD03C88DA8C5EC5248639 579C41EA5EC846F8B641A42F9EE3E855 CMT 1 N 2013-04-04 20:12:57 2013-04-04 20:29:55 1 1017 3.6 -74.006371 40.744755 -73.961662 40.761082
In [45]:
#Extracting the frequency of passenger counts in a cab into 2 lists - index and respective counts
#Plotting the counts and setting the corresponding xticks from the index
passenger_count = trip[' passenger_count'].value_counts().sort_index()
passenger_count_list_index = list(passenger_count.index)
passenger_count_list_values = list(passenger_count.values)

no_of_types = range(len(passenger_count))
plt.xticks([x+0.5 for x in no_of_types],passenger_count_list_index) #Adding 0.5 to place the xticks at the middle of each bar
plt.bar(no_of_types,passenger_count_list_values, alpha=0.5) #Discrete variable, hence bar chart
plt.yscale('log')                                          #Adding log scale, as the values fluctuate widely
plt.ylabel('No of trips')
plt.xlabel('No of passengers in cab')
plt.title('Distribution of passenger_counts')
plt.savefig('trip_passenger_count_distribution.png')
plt.show()
In [44]:
#Extracting the frequency of each payment mode into 2 lists - index and respective counts
#Plotting the counts and setting the corresponding xticks from the index
payment_type = fare[' payment_type'].value_counts()
payment_type_list_index = list(payment_type.index)
payment_type_list_values = list(payment_type.values)
no_of_types = range(len(payment_type))   
plt.xticks([x+0.5 for x in no_of_types],payment_type_list_index) #Adding 0.5 to place the xticks at the middle of each bar
plt.bar(no_of_types,payment_type_list_values, alpha=0.5) # Discrete variable, hence bar chart
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Payment modes')
plt.title('Distribution of payment_types')
plt.savefig('fare_payment_type_distribution.png')
plt.show()
In [42]:
#Plot the histogram of fare amount frequency with the min and max values as limits
#Continuous variable, hence a histogram is plotted
plt.hist(fare[' fare_amount'],alpha =0.5, bins = 100)
plt.xlim([fare[' fare_amount'].min(),fare[' fare_amount'].max()])
plt.yscale('log')           #log scale, as the values span a large range
plt.ylabel('No of trips')
plt.xlabel('Fare amount')
plt.title('Distribution of fare')
plt.savefig('fare_amount_distribution.png')
plt.show()
In [41]:
#Plot the histogram of tip amount frequency with the min and max values as limits
#Continuous variable, hence a histogram is plotted
plt.hist(fare[' tip_amount'],alpha =0.5, bins = 100)
plt.xlim([fare[' tip_amount'].min(),fare[' tip_amount'].max()])
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Tip amount')
plt.title('Distribution of tips')
plt.savefig('tip_amount_distribution.png')
plt.show()
In [14]:
#Plot the histogram of total amount frequency with the min and max values as limits
#Continuous variable, hence a histogram is plotted
plt.hist(fare[' total_amount'],alpha =0.5, bins = 100)
plt.xlim([fare[' total_amount'].min(),fare[' total_amount'].max()])
plt.yscale('log')
plt.ylabel('Frequency of total_amount')
plt.title('Distribution of total_amount')
plt.savefig('total_amount_distribution.png')
plt.show()
In [39]:
#Assumption: the busiest hour implies both pickups and dropoffs are highest
#First generate pickup and dropoff hours from the timestamps, then generate the frequency of pickups & dropoffs
#Discrete variable, hence a bar chart is plotted

trip['pickup_hour'] = pd.to_datetime(trip[' pickup_datetime']).dt.hour

trip['dropoff_hour'] = pd.to_datetime(trip[' dropoff_datetime']).dt.hour

#Combine pickup & dropoff frequency per hour
time_distribution = trip['pickup_hour'].value_counts().add(trip['dropoff_hour'].value_counts()) 
time_distribution.plot.bar(alpha=0.5)
print ("The top 5 busiest hours in descending order are")
busiest_hours = time_distribution.sort_values(ascending=False).index[:5].tolist() #Sort the series based on frequency
print (busiest_hours)
plt.ylabel('No. of trips')
plt.title('Distribution of trips - hourly basis')
plt.savefig('time_distribution.png')
plt.show()
The top 5 busiest hours in descending order are
[19, 20, 18, 21, 22]

Creating sample dataset for later use

Working with the entire dataset was becoming tough, with processing times for each snippet running into hours. Hence I took the relevant columns from the fare dataset and concatenated them with the trip dataset, as those were the only columns from the fare dataset I was going to use henceforth

In [55]:
trip = pd.read_pickle('trip_pickup_day.pkl')

#Getting the pickup hour from the DateTimeStamp
print ("Getting the hour data of the pickups.....")
trip['pickup_hour'] = pd.to_datetime(trip[' pickup_datetime']).dt.hour

print ("Reading the fare csv file.....")
fare = pd.read_csv('trip_fare_4.csv')
fare = fare[[' fare_amount', ' tip_amount']]

#Concatenating the columns fare amount and tip amount, as they are the only relevant columns from the fare dataset
print ("Concatenating the fare & tip amount to the original trip dataset")
trip_new = pd.concat([trip, fare.reindex(trip.index)], axis=1)

print("Saving the final datset")
trip_new.to_pickle('ultimate_datset.pkl')
Getting the hour data of the pickups.....
Reading the fare csv file.....
Concatenating the fare & tip amount to the original trip dataset
Saving the final dataset

The rest of the basic questions & optimisation problems

Strategy - Choosing a particular day

The processing time was extremely high for any operation, as the dataset is huge. For the rest of the work, at the cost of some generality, I chose to select a single day to represent the month. Evidently this is biased, as seen from the distribution plotted below; nonetheless, operating on the full dataset would've been tough on this 2GB laptop, so 1st April 2013, the day with the fewest rides in the month (refer Appendix), has been chosen for references hereon

In [3]:
#Read the data and choose the day with the fewest records to save processing time
trip = pd.read_pickle('ultimate_datset.pkl')
trip['pickup_day'].value_counts().plot.bar(alpha=0.5)
plt.savefig('day_of_month_dist.png')
plt.show()

#Choosing 1st April (a Monday), though the inherent bias is evident, as Mondays
#seem to have the fewest pickups in the entire month
trip_1=trip[trip['pickup_day']==1]
trip_1.to_pickle('trip_1.pkl')

Strategy - Choosing a grid

For answering the rest of the basic questions on trips and locations (joint distribution of latitude & longitude), I chose to superimpose a grid on the map of NYC, dividing the region into small squares.

The location of NYC, as per Google, is 40.71, -74.00, so the grid should contain these co-ordinates. The grid is chosen so that it is centred around the mode and the trip frequency within it is at least 1000. As per the distributions plotted in Appendix 1, we choose the grid boundaries as [40.65, 40.85] for latitude and [-74.05, -73.75] for longitude.

Also, as per research, NYC is ~800 sq km, and 1 degree of latitude is ~110 km while 1 degree of longitude is ~90 km at 40 degrees latitude - so we assume the chosen rectangular grid can mimic the dynamics of NYC to a good extent
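As a minimal sketch of the binning that the class below implements (assuming the default 0.02 degree spacing; the example coordinate is illustrative):

#Hypothetical example: map a pickup at (40.755, -73.985) to its grid region
lat_1, long_1 = 40.65, -74.05                  #lower boundaries of the grid
x_space = y_space = 0.02                       #grid spacing in degrees (~2.2 km x ~1.8 km per cell)
lat_reg = int((40.755 - lat_1) // y_space)     #-> 5
long_reg = int((-73.985 - long_1) // x_space)  #-> 3
region = '{},{}'.format(lat_reg, long_reg)     #-> '5,3'
#reversing the binning gives the centre of the square, as reg_to_coord below does
centre = ((lat_reg + 0.5)*y_space + lat_1, (long_reg + 0.5)*x_space + long_1)  #-> (40.76, -73.98)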

Creating grid-based regions

In [13]:
class grid_network:
    
    def __init__(self,lat = (40.85,40.65), long = (-73.75,-74.05),x_space=0.02, y_space=0.02):
        
        #boundaries of the grid and spacing are stored as class variables
        
        self.lat_1 = min(lat)
        self.lat_2 = max(lat)

        self.long_1 = min(long)
        self.long_2 = max(long)
        
        self.x_space = x_space
        self.y_space = y_space
    
    # functions to convert the co-ordinates to their pickup and dropoff regions, storing them as columns
    
    def pickup_region(self,df):
        lat_reg = ((df[' pickup_latitude'] - self.lat_1)//self.y_space).astype(int)
        long_reg = ((df[' pickup_longitude'] - self.long_1)//self.x_space).astype(int)
        df['pickup_reg'] = lat_reg.astype(str) + ',' + long_reg.astype(str)
        return df

    def dropoff_region(self,df):
        lat_reg = ((df[' dropoff_latitude'] - self.lat_1)//self.y_space).astype(int)
        long_reg = ((df[' dropoff_longitude'] - self.long_1)//self.x_space).astype(int)
        df['dropoff_reg'] = lat_reg.astype(str) + ',' + long_reg.astype(str)
        return df
    
    # function to convert a region back to the lat,long co-ordinates of the centre of its square
    
    def reg_to_coord(self, reg):
        lat = round((int(reg.split(',')[-2])+0.5)*self.y_space + self.lat_1,3)
        long = round((int(reg.split(',')[-1])+0.5)*self.x_space + self.long_1,3)
        return '{},{}'.format(lat,long)
    
    #function to get the busy regions of the city and convert them back to lat, long
    #busy means both dropoffs and pickups are high
    #pass val=10 to choose the top 10 busy locations
    
    def get_busy_locations(self,df,val=10):
        locations = (df['dropoff_reg'].value_counts()).add(df['pickup_reg'].value_counts(), fill_value =0)

        print("The top {} busiest locations are ".format(val))

        for key in locations.sort_values(axis=0, ascending = False).keys()[:val]:
            print (self.reg_to_coord(key), key, locations[key])

    # function that gets the trip route with the highest sigma of travel time
     
    def get_region_with_highest_sigma_time(self,df):
        
        df_region_with_highest_sigma = df.groupby(['trip_route']).agg({' trip_time_in_secs':'std', 'trip_route':'count'})
        #applying a filter of 10 trips so that outlier datapoints aren't considered
        df_region_with_highest_sigma = df_region_with_highest_sigma[df_region_with_highest_sigma['trip_route']>10]
        
        item = df_region_with_highest_sigma.sort_values(' trip_time_in_secs', ascending =False).index[0]
        keys = df_region_with_highest_sigma[' trip_time_in_secs'].loc[item]

        print ("The trip with max sigma in time : route {},pickup at {} and drop_off at {} and is {}s".format(item,
             self.reg_to_coord(item.split('-')[-2]),self.reg_to_coord(item.split('-')[-1]),round(keys,2)))
    
    #function that gets the trip route with the least sigma of fares
    def get_region_with_consistent_fare(self,df):
        
        df_consistent_fares = df.groupby(['trip_route']).agg({' fare_amount':'std', 'trip_route':'count'})
        #applying a filter of 10 trips so that outlier datapoints aren't considered
        df_consistent_fares = df_consistent_fares[df_consistent_fares['trip_route']>10]
        
        item = df_consistent_fares.sort_values(' fare_amount',ascending = True).index[0]
        keys = df_consistent_fares[' fare_amount'].loc[item]
       

        print ("The trip with min variation in fare : route {},pickup at {} and drop_off at {} and is {}".format(item,
             self.reg_to_coord(item.split('-')[-2]),self.reg_to_coord(item.split('-')[-1]),round(keys,2)))

    # function to create the regions 
    def create_regions(self,df):

        print ("Creating borders..")
        
        #selecting the dataframe so that all pickups and dropoffs lie within the boundaries of the grid
        df = df[(df[' pickup_longitude'] > self.long_1) & (df[' pickup_longitude'] < self.long_2)] 
        df = df[(df[' pickup_latitude'] > self.lat_1) & (df[' pickup_latitude'] < self.lat_2)]
        df = df[(df[' dropoff_longitude'] > self.long_1) & (df[' dropoff_longitude'] < self.long_2)] 
        df = df[(df[' dropoff_latitude'] > self.lat_1) & (df[' dropoff_latitude'] < self.lat_2)] 
        df = self.pickup_region(df)
        df = self.dropoff_region(df)
        
        #define trip route as string 'region_pickup-region_dropoff"
        df['trip_route']  = df['pickup_reg'] + '-' + df['dropoff_reg']

        #checking the no of trips whose source and sink are different regions
        #ideally this should be high so as to capture the dynamics across regions
        print ("No of trips to another region : {:7d}".format(len(df.index[df['pickup_reg']!=df['dropoff_reg']])))
        
        #In the quest to maximise the above quantity, regions could become so small
        #that no route has enough trips to draw conclusions
        print ("No. of routes with more than 5 trips in the day : {}".format(sum(df['trip_route'].value_counts()>5)))
        
        #store the processed data
        filename = 'df_full_x_space_{}_y_space{}.pkl'.format(10*self.x_space,10*self.y_space)
        df.to_pickle(filename)


        return df,filename

   

Snippet to create the grid & address the rest of the basic questions, specific only to the grid

To do this you need to have executed the previous block of code, Creating grid-based regions

In [3]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df = pd.read_pickle('trip_1.pkl') #read the previously pickled file
    trip = df                        # another reference to the dataframe
    grid = grid_network(x_space = 0.01, y_space = 0.01) #initialise the class
    trip,filename = grid.create_regions(trip)  #create grid
    grid.get_busy_locations(trip)    #get the top 10 busy locations
    grid.get_region_with_highest_sigma_time(trip) # get the trip route with highest sigma in travel time
    grid.get_region_with_consistent_fare(trip) # get the route with least sigma in fares
    print (filename)         #print the filename for the next part of problem
Creating borders..
No of trips to another region :  382827
No. of routes with more than 5 trips in the day : 4370
The top 10 busiest locations are 
40.755,-73.975 10,7 60056.0
40.755,-73.985 10,6 42379.0
40.745,-73.985 9,6 38260.0
40.765,-73.985 11,6 35898.0
40.755,-73.995 10,5 34869.0
40.765,-73.975 11,7 34137.0
40.745,-73.995 9,5 32279.0
40.765,-73.965 11,8 31396.0
40.775,-73.955 12,9 27990.0
40.735,-73.995 8,5 26140.0
The trip with max sigma in time : route 7,3-7,3, pickup at 40.725,-74.015 and drop_off at 40.725,-74.015, sigma 1281.53s
The trip with min variation in fare : route 6,6-6,7, pickup at 40.715,-73.985 and drop_off at 40.715,-73.975, sigma 0.52
df_full_x_space_0.1_y_space0.1.pkl

Diving into optimisation problems

To do this you need to have executed

  1. Creating grid-based regions
  2. Snippet to create the grid & answer the rest of the basic questions based on the grid

The nature of the relationship between trip fare and trip time needed to be established before proceeding with the problem statement.

The relationship seems linear, at least from a day's data, so we go with a linear model (see Appendix 2 for more details)

Optimisation model for maximum earnings

As it turns out the relationship is linear, so we calculate the "mean fare per min travelled" for each hour, for each source location and sink location.

From the distribution of "mean fare per min travelled", it is observed that the value can hardly go beyond 4, so applying that filter on "mean fare per min travelled" makes sense. (See Appendix 3 for the plot)
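Concretely, the parameter computed below for each (hour, pickup region, dropoff region) group is

mean_fare_per_min = 60 * mean(fare_amount) / mean(trip_time_in_secs)

As an illustrative (made-up) instance, a route whose trips average a 10 unit fare over a 600 s travel time gives 60 * 10 / 600 = 1.0 fare units per minute, comfortably under the filter value of 4.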

In [9]:
filename = 'NYC_Cabs_Data Challenge/df_full_x_space_0.1_y_space0.1.pkl'
df = pd.read_pickle(filename)

df_hourly_rate_per_route = df.groupby(['pickup_hour','pickup_reg','dropoff_reg']).agg({' trip_time_in_secs':'mean',' fare_amount':'mean','dropoff_reg':'count' })
#replace trip times of 0 secs (very likely bad data) with a high number
#so that the mean fare per min is a small number and isn't considered in the optimisation
df_hourly_rate_per_route[' trip_time_in_secs']=df_hourly_rate_per_route[' trip_time_in_secs'].replace(0,10000)
df_hourly_rate_per_route['mean_fare_per_min']= (df_hourly_rate_per_route[' fare_amount']*60).divide(df_hourly_rate_per_route[' trip_time_in_secs'])

#the dataframe for calculating the avg trip times for each hour, for each pair of pickup and dropoff regions
df_avg_time_routes = df.groupby(['pickup_reg','pickup_hour','dropoff_reg'])[' trip_time_in_secs'].mean().to_frame('mean_travel_time')

#the dataframe for calculating demand in each region at each hour, by counting the no of pickups in the region
df_demand_hourly_regions = df.groupby(['pickup_hour', 'pickup_reg'])['pickup_reg'].count().to_frame('avg_hourly_demand')

#the "mean fare per min travelled" parameter filtered of outlier datapoints by applying filter value of 4
df_hourly_rate_per_route['mean_fare_per_min'] = df_hourly_rate_per_route['mean_fare_per_min'].apply(lambda x: x if x < 4 else 0)
In [24]:
def select_next_opt_trip(start_time, start_reg = None, end_time=23,total_fare=0,trip_no=0, start=True,next_day=False, verbose = 1):
    
    #flag to make sure that, if driving into the next day, end_time > start_time
    if next_day:
        end_time = end_time +24
        next_day =False
    
    #flag to reset start_time to 0 as soon as it's past 23:59
    if start_time >= 24:
        start_time= start_time -24
        end_time = end_time - 24
    
    #if the start flag is set, we start in the region with the highest "mean fare per min" at that hour
    if start:
        start_hour =int(start_time)
        start_reg = df_hourly_rate_per_route['mean_fare_per_min'].loc[start_hour].idxmax()[0]
        if verbose > 0 :
            print ("He starts in the region {}, [{}] at {}".format(start_reg,grid.reg_to_coord(start_reg),start_time))
        start=False
    
    # check whether start_time is less than end_time - the stopping criterion
    if start_time < end_time:
        if round(start_time)==24:  #so that after 23:30 hrs, start_hour doesn't become 24
            start_hour =0
        else:                   
            start_hour = round(start_time) # otherwise round start_time to the nearest integer
        
        #get the list of possible dropoff regions at the start location at that hour
        df = df_avg_time_routes.loc[start_reg].loc[start_hour]
        
        #get the avg time of arrival at each possible dropoff location
        df['hour_arrival_dropoff'] = round(df['mean_travel_time'].divide(3600) + start_time).astype(int)
        df['expected_demand_arrival_flag'] =0
        df['fare_per_min'] =0
        
        #get the demand at each dropoff location at the corresponding time of arrival & check if demand > threshold
        #threshold is calculated based on median of demands at that hour across all regions
        for i in range(len(df)):
            try:
                threshold = df_demand_hourly_regions.loc[df['hour_arrival_dropoff'].iloc[i]]['avg_hourly_demand'].median()*0.2
                df['expected_demand_arrival_flag'].iloc[i] = df_demand_hourly_regions.loc[df['hour_arrival_dropoff'].iloc[i]].loc[df.index[i]][0] > threshold 
            except:
                continue
        
        #get the "mean fare per min" parameter for all possible dropoff locations 
        for i in range(len(df)):
            try: 
                df['fare_per_min'].iloc[i] = df_hourly_rate_per_route.loc[df['hour_arrival_dropoff'].iloc[i]].loc[start_reg].loc[df.index[i]]['mean_fare_per_min']  
            except:
                continue
        
        #calculate optimum score as product of maximising parameter - "mean fare per min" & demand flag
        df['optimum_score']=df['fare_per_min'].multiply(df['expected_demand_arrival_flag'])

        #if a best optimum score exists, keep recursively checking for the next rides
        try:
            dropoff_reg = df['optimum_score'].idxmax()
            
            #if the best score is 0, it's likely that demand in the best possible dropoff region is below the threshold
            #so search for the nearest best region in the neighbourhood
            if df['optimum_score'].max() ==0:
                if verbose > 0 :
                    print ("Demand is below the threshold, so searching around..")
                adj_reg,travel_time = search_adjacent_region(start_reg,start_time,verbose = verbose)
                start_time += travel_time
                if verbose > 0 :
                    print ("Starting journey from adjacent region {} at {}:{:2.0f}".format(adj_reg, int(start_time),
                                                                              (start_time-int(start_time))*60))
                
                return select_next_opt_trip(start_time,adj_reg,end_time, total_fare,trip_no,start,next_day,verbose)
            
            else:
                
                #Calculate the expected dropoff time and fare, and keep recursively choosing the best trip
                dropoff_time = round(start_time + (df['mean_travel_time'].loc[dropoff_reg])/3600,2) + 0.15 # added 0.15 hrs ~ 9 min to account for fare collection and the next pickup
                trip_no+=1
                
                #calculate expected fare amount 
                expected_mean_fare=round(df_hourly_rate_per_route.loc[start_hour].loc[start_reg].loc[dropoff_reg][' fare_amount'],2)
                total_fare+=expected_mean_fare
                if verbose > 0 :
                    print ('Trip {}: Next dropoff is at {}, [{}] and at {}:{:2.0f}, expected_fare : {:4.2f}'.format(trip_no,
                                                                                       dropoff_reg,
                                                                                       grid.reg_to_coord(dropoff_reg),
                                                                                       int(dropoff_time),
                                                                                       round((dropoff_time-int(dropoff_time))*60),
                                                                                      expected_mean_fare))

                
                return select_next_opt_trip(dropoff_time, dropoff_reg,end_time, total_fare,trip_no,start,next_day,verbose)
                
        except:
            
            #if there's no demand at the current pickup region, we need to search the nearest regions for a good pickup
            if verbose > 0 :
                print ("No demand for cab at region {}".format(start_reg))
            adj_reg,travel_time = search_adjacent_region(start_reg,start_time,verbose = verbose)
            start_time += travel_time
            if verbose > 0 :
                print ("Starting journey from adjacent region {} at {}:{:2.0f}".format(adj_reg, int(start_time),
                                                                              (start_time-int(start_time))*60))
            
            return select_next_opt_trip(start_time,adj_reg,end_time, total_fare,trip_no,start,next_day,verbose)
            
        
        
    else:
        if verbose > 0 :
            print ("It's already {}, time to pack".format(end_time))
            print ("Already earnt fare amount {} for the day!".format(round(total_fare,2)))
        return total_fare



def search_adjacent_region(reg,time,degree=3,verbose =1 ):
    
    if verbose > 0 :
        print ("Searching adjacent regions around {} at {}:{:2.0f}....".format(reg,int(time),(time-int(time))*60))
    time = int(time)
    best_reg = None
    best_mean_fare = -1
    
    #vals indicates the degree of the search
    #a 1st degree search checks just the 8 adjacent squares for the best "mean fare per min" parameter, along with demand
    #keep increasing the degree until we find a region with demand
    for vals in range(1,degree):
        reg_1 = int(reg.split(',')[-2]) 
        reg_2 = int(reg.split(',')[-1])
        reg_list =[]
        # form a list of all the adjacent regions to search in the ith degree search
        for i in range(-vals,vals+1):
            for j in range(-vals,vals+1):
                if (reg_1+i >= 0) & (reg_2+j >=0): 
                    reg_element = str(reg_1+i) +',' +str(reg_2+j)
                    reg_list.append(reg_element)
        #for all regions to be searched in the ith degree, check for the best "mean fare per min" where there's demand
        for regs in reg_list:
            
            try:
                mean_fare = df_hourly_rate_per_route['mean_fare_per_min'].loc[time].loc[regs].max()
            
                travel_time_list = df_avg_time_routes['mean_travel_time'].loc[(df_avg_time_routes.index.get_level_values('pickup_reg')==reg) & 
                                                 (df_avg_time_routes.index.get_level_values('dropoff_reg') == regs)]

                if (mean_fare > best_mean_fare) & (regs!=reg):
                    best_mean_fare = mean_fare
                    best_reg = regs
                    best_travel_time_list = travel_time_list
            except:
                #if there's no demand at the region being searched, continue searching the other regions
                continue
        
        # after searching all regions in the ith degree search:
        #if there's even one region with demand, we move there
        #if there are more than one, choose the one with the best "mean fare per min"
        #if there's no region with demand, increase the degree of search
        if best_reg is not None:
            if len(best_travel_time_list) > 0:
                best_travel_time= (best_travel_time_list.mean())/3600
            else:
                #if there's no data for the average travel time between the current region and the adjacent 'best' region,
                #assume a travel time of 6 mins per degree of search (0.1 hrs * degree, based on observations)
                if verbose > 0 :
                    print("Not enough data to get mean travel time to {} from {}, approx_time : {}".format(reg,best_reg,6*vals))
                best_travel_time = 0.1*vals
            if verbose > 0 :
                print ("Nearest Best Region found at {} at distance of {} mins ".format(best_reg, round(best_travel_time*60,2)))
            return best_reg, round(best_travel_time,2)
        else:
            if verbose > 0 :
                print("No good region found near region {} in {} degree search".format(reg,vals))
            continue
        

Code snippet to run the optimisation algorithm

You can specify the start_time and end_time and print the routes the cab driver takes.

Also, if you want, you can select a custom location where the cab starts by setting the flag 'start' to False. If the flag is set to False, the code doesn't search for the best location to start from at that hour. An example is sketched after the parameter list below.

To do this you need to have executed

  1. Creating grid-based regions
  2. Snippet to create the grid & answer the rest of the basic questions based on the grid
  3. Optimisation model for maximum earnings and the non-commented snippets beneath it
  4. Optimisation algorithm for getting maximum earnings

The snippet of code to compute the maximum earnings, given the following parameters

  1. start_time - Starting time of the day, when the cab starts its trips for the day
  2. start_reg (Optional) - Starting region of the cab (as per the grid - should be of the format 'i,j'). If not given, the algorithm chooses the initial starting point by itself. If given, make sure to set the start flag to False
  3. end_time (Optional) - The ending time of the cab's day. If not given, the default is 23
  4. total_fare (Optional) - Defaults to 0 at the start of the day; if starting from a custom region in the middle of the day, you can specify the total amount earnt until then
  5. trip_no (Optional) - The trip number of the day. If not given, defaults to 0
  6. start (Optional) - Flag indicating whether the algorithm starts the day fresh or continues from the current location. If False, start_reg must be given. Default: True
  7. next_day (Optional) - Whether to drive into the next day (that is, beyond 23:59 hrs). If set to False, end_time should be greater than start_time. Default: False
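For instance, a run starting mid-day from a custom region might look like the following sketch (the region id '10,6' and the times are illustrative; the grid and the dataframes from the snippets above must already exist):

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid = grid_network(x_space = 0.01, y_space = 0.01)
    #start at 9:00 from region '10,6' and drive till 17:00; start=False skips the automatic choice of starting region
    select_next_opt_trip(9, start_reg='10,6', end_time=17, start=False, verbose=1)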
In [26]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid = grid_network(x_space = 0.01, y_space = 0.01)
    select_next_opt_trip(23,end_time = 1, next_day=True, verbose = 1) #use different parameters as per requirement
He starts in the region 9,5, [40.745,-73.995] at 23
Trip 1: Next dropoff is at 8,5, [40.735,-73.995] and at 23:13, expected_fare : 5.11
Trip 2: Next dropoff is at 8,4, [40.735,-74.005] and at 23:25, expected_fare : 4.89
Trip 3: Next dropoff is at 9,5, [40.745,-73.995] and at 23:38, expected_fare : 5.46
Demand is below the threshold, so searching around..
Searching adjacent regions around 9,5 at 23:38....
Not enough data to get mean travel time to 9,5 from 10,6, approx_time : 6
Nearest Best Region found at 10,6 at distance of 6.0 mins 
Starting journey from adjacent region 10,6 at 23:44
Demand is below the threshold, so searching around..
Searching adjacent regions around 10,6 at 23:44....
Not enough data to get mean travel time to 10,6 from 9,5, approx_time : 6
Nearest Best Region found at 9,5 at distance of 6.0 mins 
Starting journey from adjacent region 9,5 at 23:50
Demand is below the threshold, so searching around..
Searching adjacent regions around 9,5 at 23:50....
Not enough data to get mean travel time to 9,5 from 10,6, approx_time : 6
Nearest Best Region found at 10,6 at distance of 6.0 mins 
Starting journey from adjacent region 10,6 at 23:56
Demand is below the threshold, so searching around..
Searching adjacent regions around 10,6 at 23:56....
Not enough data to get mean travel time to 10,6 from 9,5, approx_time : 6
Nearest Best Region found at 9,5 at distance of 6.0 mins 
Starting journey from adjacent region 9,5 at 24: 2
Trip 4: Next dropoff is at 9,5, [40.745,-73.995] and at 0:13, expected_fare : 3.65
Trip 5: Next dropoff is at 9,5, [40.745,-73.995] and at 0:24, expected_fare : 3.65
Trip 6: Next dropoff is at 9,5, [40.745,-73.995] and at 0:35, expected_fare : 3.65
Trip 7: Next dropoff is at 16,8, [40.815,-73.965] and at 0:55, expected_fare : 18.00
Trip 8: Next dropoff is at 15,8, [40.805,-73.965] and at 1: 5, expected_fare : 3.50
It's already 1, time to pack up
Fare earned for the day: 47.91!

Snippet for Maximum Revenue in 6 Hours

As we will see below, the median driving time for a driver is 6 hours. This snippet calculates the optimised revenue for a driver driving for 6 hours, starting at different times of the day

In [36]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fares ={}
    for t in range(0,24):
        if t+6 > 24:
            next_day=True
            end_time = (t+6) - 24
        else:
            next_day = False
            end_time = (t+6)
        fares[t] = select_next_opt_trip(t,end_time = end_time, next_day=next_day,verbose = 0)
    
    plt.bar(range(len(fares)), fares.values(), align='center',alpha =0.5)
    plt.xticks(range(len(fares)), fares.keys())
    plt.show()    

Optimisation Model for earning the average fare but in minimum time

Driver average details - fare amount and hours worked

In [39]:
#total fare per driver, with hours worked approximated as (last pickup hour - first pickup hour);
#drivers whose first pickup hour is 8 or earlier are flagged with -2 and filtered out below
df_driver = df.groupby([' hack_license']).agg({' fare_amount':'sum', 'pickup_hour': lambda x:np.max(x)-np.min(x) if np.min(x) > 8 else -2})
df_driver = df_driver[df_driver['pickup_hour']>0]

print (df_driver['pickup_hour'].median())
print (df_driver[' fare_amount'].median())
6.0
177.5
In [40]:
def select_next_opt_trip_with_min_time(start_time, start_reg = None,total_fare=0,trip_no=0,total_time=0,start=True):
    
    #check that start_time is within 23:59 hrs; wrap around if it overshoots
    if start_time >= 24:
        start_time= start_time -24
    
    #if it's the start of the day, select the starting location automatically
    if start:
        start_hour = round(start_time)
        start_reg = df_hourly_rate_per_route['mean_fare_per_min'].loc[start_hour].idxmax()[0]
    
    #stopping criterion: keep driving until the median daily fare (177.5, computed above) has been earned
    if total_fare < 177.5 :
        if round(start_time)==24: #so that after 23:30 hrs, start_hour doesn't become 24
            start_hour = 0
        else:
            start_hour =round(start_time) # otherwise round start_time to the nearest integer
        
        #get the list of possible dropoff regions at the start location at that hour
        df = df_avg_time_routes.loc[start_reg].loc[start_hour]
        
        #get the avg time of arrival at each possible dropoff location
        df['hour_arrival_dropoff'] = round(df['mean_travel_time'].divide(3600) + start_time).astype(int)
        df['expected_demand_arrival_flag'] =0
        df['fare_per_min'] =0
        
        #get the demand at each dropoff location at the corresponding time of arrival & check if demand > threshold
        #threshold is calculated based on median of demands at that hour across all regions
        for i in range(len(df)):
            try:
                threshold = df_demand_hourly_regions.loc[df['hour_arrival_dropoff'].iloc[i]]['avg_hourly_demand'].median()*0.2
                df['expected_demand_arrival_flag'].iloc[i] = df_demand_hourly_regions.loc[df['hour_arrival_dropoff'].iloc[i]].loc[df.index[i]][0] > threshold 
            except:
                continue
        #get the "mean fare per min" parameter for all possible dropoff locations 
        for i in range(len(df)):
            try: 
                df['fare_per_min'].iloc[i] = df_hourly_rate_per_route.loc[df['hour_arrival_dropoff'].iloc[i]].loc[start_reg].loc[df.index[i]]['mean_fare_per_min']  
            except:
                continue
        #calculate optimum score as product of maximising parameter - "mean fare per min" & demand flag
        df['optimum_score']=df['fare_per_min'].multiply(df['expected_demand_arrival_flag'])

        #if a best optimum score exists, keep recursively checking for the next rides
        try:
            dropoff_reg = df['optimum_score'].idxmax()
            
            #if the best score is 0, it's likely that demand in the best possible dropoff region is below the threshold
            #so search for the nearest best region in the neighbourhood
            if df['optimum_score'].max() ==0:
                adj_reg,travel_time = search_adjacent_region(start_reg,start_time)
                start_time += travel_time
                total_time += travel_time
                
                return select_next_opt_trip_with_min_time(start_time,adj_reg, total_fare,trip_no,total_time,start=False)
            
            else:
                #Calculate the expected dropoff time and fare, and keep recursively choosing the best trip
                dropoff_time = round(start_time + (df['mean_travel_time'].loc[dropoff_reg])/3600,2) + 0.15
                total_time+= dropoff_time-start_time
                trip_no+=1
                 #calculate expected fare amount
                expected_mean_fare=round(df_hourly_rate_per_route.loc[start_hour].loc[start_reg].loc[dropoff_reg][' fare_amount'],2)
                total_fare+=expected_mean_fare

                
                return select_next_opt_trip_with_min_time(dropoff_time, dropoff_reg, total_fare,trip_no,total_time,start=False)
                
        except:
            #if there's no demand at the current pickup region, we need to search the nearest regions for a good pickup
            adj_reg,travel_time = search_adjacent_region(start_reg,start_time)
            start_time += travel_time
            total_time += travel_time
        
            return select_next_opt_trip_with_min_time(start_time,adj_reg,total_fare,trip_no,total_time,start=False)
            
        
        
    else:
        return total_time



def search_adjacent_region(reg,time,degree=3 ):
    time = int(time)
    best_reg = None
    best_mean_fare = -1
    
    #vals indicates the degree of the search
    #a 1st degree search checks just the 8 adjacent squares for the best "mean fare per min" parameter, along with demand
    #keep increasing the degree until we find a region with demand
    for vals in range(1,degree):
        reg_1 = int(reg.split(',')[-2])
        reg_2 = int(reg.split(',')[-1])
        reg_list =[]
        # form a list of all the adjacent regions to search in the ith degree search
        for i in range(-vals,vals+1):
            for j in range(-vals,vals+1):
                if (reg_1+i >= 0) & (reg_2+j >=0): 
                    reg_element = str(reg_1+i) +',' +str(reg_2+j)
                    reg_list.append(reg_element)
        #for all regions to be searched in the ith degree, check for the best "mean fare per min" where there's demand
        for regs in reg_list:
            try:
                mean_fare = df_hourly_rate_per_route['mean_fare_per_min'].loc[time].loc[regs].max()
            
                travel_time_list = df_avg_time_routes['mean_travel_time'].loc[(df_avg_time_routes.index.get_level_values('pickup_reg')==reg) & 
                                                 (df_avg_time_routes.index.get_level_values('dropoff_reg') == regs)]

                if (mean_fare > best_mean_fare) & (regs!=reg):
                    best_mean_fare = mean_fare
                    best_reg = regs
                    best_travel_time_list = travel_time_list
            except:
                #if there's no demand at the region being searched, continue searching the other regions
                continue
        #after searching all regions in the ith degree search:
        #if there's even one region with demand, we move there
        #if there are more than one, choose the one with the best "mean fare per min"
        #if there's no region with demand, increase the degree of search
        if best_reg is not None:
            if len(best_travel_time_list) > 0:
                best_travel_time= (best_travel_time_list.mean())/3600
            else:
                #if there's no data for the average travel time between the current region and the adjacent 'best' region,
                #assume a travel time of 6 mins per degree of search (0.1 hrs * degree, based on observations)
                best_travel_time = 0.1*vals
            return best_reg, round(best_travel_time,2)
        else:
            continue
In [41]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    best_total_time = 24
    for i in range(24):
        total_time = select_next_opt_trip_with_min_time(i)
        print ("Starting at {} total time to earn min avg fares is {}".format(i,round(total_time,2)))
        if total_time < best_total_time:
            best_total_time = total_time
            best_start_hour = i
print ("Best Start hour is {}, taking only {} hours to get to average fare while average time of travel for cab drivers are 6 hours".format(best_start_hour, round(best_total_time,2)))
Starting at 0 total time to earn min avg fares is 3.62
Starting at 1 total time to earn min avg fares is 4.77
Starting at 2 total time to earn min avg fares is 4.41
Starting at 3 total time to earn min avg fares is 4.47
Starting at 4 total time to earn min avg fares is 3.46
Starting at 5 total time to earn min avg fares is 3.96
Starting at 6 total time to earn min avg fares is 2.95
Starting at 7 total time to earn min avg fares is 3.58
Starting at 8 total time to earn min avg fares is 2.56
Starting at 9 total time to earn min avg fares is 3.06
Starting at 10 total time to earn min avg fares is 2.84
Starting at 11 total time to earn min avg fares is 3.65
Starting at 12 total time to earn min avg fares is 3.1
Starting at 13 total time to earn min avg fares is 5.47
Starting at 14 total time to earn min avg fares is 4.45
Starting at 15 total time to earn min avg fares is 3.41
Starting at 16 total time to earn min avg fares is 3.58
Starting at 17 total time to earn min avg fares is 3.99
Starting at 18 total time to earn min avg fares is 3.78
Starting at 19 total time to earn min avg fares is 5.36
Starting at 20 total time to earn min avg fares is 6.58
Starting at 21 total time to earn min avg fares is 4.84
Starting at 22 total time to earn min avg fares is 4.0
Starting at 23 total time to earn min avg fares is 6.76
Best Start hour is 8, taking only 2.56 hours to reach the average fare, while the average driving time for cab drivers is 6 hours

Fare prediction engine

To apply an ML model we need to select the relevant columns and then apply preprocessing to them

In [17]:
#Read the pickled file, created earlier
df = pd.read_pickle('ultimate_datset.pkl')
df = df[[' pickup_latitude',' pickup_longitude', ' dropoff_latitude', ' dropoff_longitude', 'pickup_hour','pickup_day', ' fare_amount', ' tip_amount']]

days_list = ['Sunday','Monday', 'Tuesday', 'Wednesday', 'Thursday','Friday', 'Saturday']
days_list_dict = dict(enumerate(days_list))

#Convert the pickup day of the month into the corresponding weekday (1 April 2013 was a Monday, so day%7 maps 1 -> Monday and 7 -> Sunday)
df['day_of_week']= df['pickup_day'].apply(lambda x : x%7).map(days_list_dict)

#Store the data back into a pickle file
df.to_pickle('ML_model.pkl')

Pre-processing steps for data for building the regression model

The pre-processing prepares the data for training across various ML models - the steps include one-hot encoding the categorical variables and normalising the numeric ones

Also, to remove outliers and cleanse the data, the observations with 'fare_amount' greater than 100 are dropped, as it's clear from Appendix 4 that values higher than 100 are most likely outliers

In [ ]:
#Read the pickled file
df = pd.read_pickle('ML_model.pkl')

#Drop the column as it's unnecessary
df = df.drop(['pickup_day'],axis=1)

#Select the min and max lat,long as chosen earlier, then filter the dataset
#so that all pickups and dropoffs lie within the chosen grid
min_lat = 40.65
max_lat = 40.85

min_long = -74.05
max_long = -73.75

df = df[(df[' pickup_latitude']> min_lat) & (df[' pickup_latitude']< max_lat)]
df = df[(df[' pickup_longitude']> min_long) & (df[' pickup_longitude']< max_long)]
df = df[(df[' dropoff_latitude']> min_lat) & (df[' dropoff_latitude']< max_lat)]
df = df[(df[' dropoff_longitude']> min_long) & (df[' dropoff_longitude']< max_long)]

#As seen from the distribution of fare amount above, fares above 100 are likely to be outliers
#so all corresponding observations have been removed
df = df[df[' fare_amount']<100]


#the columns to be normalised are the ones with numerical features
cols_to_norm = [columns for columns in df.columns if not columns == 'day_of_week']
df[cols_to_norm] = df[cols_to_norm].apply(lambda x: (x - x.mean()) / (x.max() - x.min()))

#convert the only categorical feature - "day_of_week" - into a one-hot encoded vector
df = pd.concat([df, pd.get_dummies(df['day_of_week'])], axis=1)

#drop the categorical variable as it's already one-hot encoded
df = df.drop(['day_of_week'],axis=1)

df.to_pickle('Final_ML_model.pkl')
In [3]:
df = pd.read_pickle('Final_ML_model.pkl')

#building the final datasets of input and output variables
Y1= df[' fare_amount']
Y2 = df[' tip_amount']
df= df.drop([' fare_amount', ' tip_amount'], axis =1)
#split the data into train and test sets; a validation set was avoided for lack of time
X_train, X_test, Y1_train, Y1_test, Y2_train, Y2_test = train_test_split(df, Y1,Y2, 
                                                    random_state = 0,train_size = 0.8)

Prediction Model for predicting fare amount

Check for the best possible score using Ridge Regression

In [5]:
alphas= [1e-3, 1e0, 1e3]
best_score =0 

for alpha in alphas:
    model=Ridge(alpha=alpha)
    model.fit(X_train,Y1_train)
    score = model.score(X_test,Y1_test)
    if score > best_score:
        best_score =score
        best_model = model
        best_alpha = alpha

print ("Best Score : {} was for alpha : {} for Ridge Regression".format(round(best_score, 4),best_alpha))
Best Score : 0.1961 was for alpha : 1.0 for Ridge Regression

Check for the best score using Lasso regression

In [8]:
#alphas= [1e-3, 1e0, 1e3] ----> best_score was 0.1105 for alpha =0.001
#alphas =[1e-6, 1e-4, 1e-2]-----> best score 0.1961 for alpha = 1e-06
alphas =[1e-9, 1e-7, 1e-5]
best_score =0 
for alpha in alphas:
    model=Lasso(alpha=alpha)
    model.fit(X_train,Y1_train)
    score = model.score(X_test,Y1_test)
    if score > best_score:
        best_score =score
        best_model = model
        best_alpha = alpha

print ("Best Score : {} was for alpha : {} for Lasso Regression".format(round(best_score,4),best_alpha))
/home/rohit/anaconda3/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py:466: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations
  ConvergenceWarning)
Best Score : 0.1961 was for alpha : 1e-07 for Lasso Regression

Check for the best model using Elastic Net

In [10]:
alphas =[1e-3, 1e0, 1e3]
l1_ratios = [0.25, 0.5, 0.75]

best_score =0
for l1_ratio in l1_ratios:
    for alpha in alphas:
        model=ElasticNet(alpha=alpha,l1_ratio =l1_ratio)
        model.fit(X_train,Y1_train)
        score = model.score(X_test,Y1_test)
        if score > best_score:
            best_score =score
            best_model = model
            best_alpha = alpha
            best_l1_ratio = l1_ratio 

print ("Best Score : {} was for alpha : {}, l1_ratio :{} for Elastic Net Regression".format(round(best_score,4),best_alpha,best_l1_ratio))
Best Score : 0.183 was for alpha : 0.001, l1_ratio :0.25 for Elastic Net Regression
In [4]:
max_depth_list =[10, 20,40]
min_samples_split_list = [100, 1000, 10000]

best_score =0 

for depth in max_depth_list:
    for split in min_samples_split_list:
        model = DecisionTreeRegressor(max_depth = depth,min_samples_split = split)
        model.fit(X_train, Y1_train)
        score = model.score(X_test, Y1_test)
        if score > best_score:
            best_score =score
            best_decison_tree_model =model
            best_depth = depth
            best_split = split
print ("Best Score : {} for max_depth :{} , min_sample_split : {} for DecisionTree Regresor".format(best_score, best_depth,best_split))
Best Score : 0.8267869531633982 for max_depth :40 , min_sample_split : 1000 for DecisionTree Regresor

Prediction model for predicting tip amount

Check for the best possible score using Ridge Regression

In [11]:
alphas= [1e-3, 1e0, 1e3]
best_score =0 

for alpha in alphas:
    model=Ridge(alpha=alpha)
    model.fit(X_train,Y2_train)
    score = model.score(X_test,Y2_test)
    if score > best_score:
        best_score =score
        best_model = model
        best_alpha = alpha

print ("Best Score : {} was for alpha : {} for Ridge Regression".format(round(best_score, 4),best_alpha))
Best Score : 0.0552 was for alpha : 1.0 for Ridge Regression
In [12]:
alphas= [1e-3, 1e0, 1e3]
best_score =0 

for alpha in alphas:
    model=Lasso(alpha=alpha)
    model.fit(X_train,Y2_train)
    score = model.score(X_test,Y2_test)
    if score > best_score:
        best_score =score
        best_model = model
        best_alpha = alpha

print ("Best Score : {} was for alpha : {} for Lasso Regression".format(round(best_score, 4),best_alpha))
Best Score : 0 was for alpha : 1.0 for Lasso Regression
In [13]:
alphas =[1e-3, 1e0, 1e3]
l1_ratios = [0.25, 0.5, 0.75]

best_score =0
for l1_ratio in l1_ratios:
    for alpha in alphas:
        model=ElasticNet(alpha=alpha,l1_ratio =l1_ratio)
        model.fit(X_train,Y2_train)
        score = model.score(X_test,Y2_test)
        if score > best_score:
            best_score =score
            best_model = model
            best_alpha = alpha
            best_l1_ratio = l1_ratio 

print ("Best Score : {} was for alpha : {}, l1_ratio :{} for Elastic Net Regression".format(round(best_score,4),best_alpha,best_l1_ratio))
Best Score : 0 was for alpha : 1.0, l1_ratio :0.25 for Elastic Net Regression
In [5]:
max_depth_list =[10, 20,40]
min_samples_split_list = [100, 1000, 10000]

best_score =0 

for depth in max_depth_list:
    for split in min_samples_split_list:
        model = DecisionTreeRegressor(max_depth = depth,min_samples_split = split)
        model.fit(X_train, Y2_train)
        score = model.score(X_test, Y2_test)
        if score > best_score:
            best_score =score
            best_decison_tree_model_tip =model
            best_depth = depth
            best_split = split
print ("Best Score : {} for max_depth :{} , min_sample_split : {} for DecisionTree Regresor".format(best_score, best_depth,best_split))
Best Score : 0.25277489152329424 for max_depth :40 , min_sample_split : 1000 for DecisionTree Regresor

Conclusion

Hence, it's pretty clear that decision trees work best for predicting both the fare & tip amounts, owing to the non-linear relationships among the variables

Appendix

Appendix 1. Snippets for deciding on the grid boundary for rest of the questions

The location of NYC, as per Google, is 40.71, -74.00, so the grid should contain these co-ordinates. The grid is chosen so that it is centred around the mode and the trip frequency within it is at least 1000.

For the same reasons, I plot the distributions of pickup & dropoff latitude & longitude, and pick the grid boundaries based on these distributions

In [3]:
#Plotting the pickup latitude distribution
min_lat =40
max_lat = 42
trip_cleansed = trip[' pickup_latitude'][(trip[' pickup_latitude']> min_lat) & (trip[' pickup_latitude']< max_lat)]
plt.hist(trip_cleansed,alpha =0.5, bins = 1000)
plt.xlim([min_lat,max_lat])
plt.ylabel('No_of_trips')
plt.xlabel('pickup_latitude')
plt.yscale('log')
plt.savefig('trip_pickup_latitude_overall_distribution.png')
plt.show()
In [53]:
#Plotting the dropoff latitude distribution
min_lat =40
max_lat = 42
trip_cleansed = trip[' dropoff_latitude'][(trip[' dropoff_latitude']> min_lat) & (trip[' dropoff_latitude']< max_lat)]
plt.hist(trip_cleansed,alpha =0.5, bins = 1000)
plt.xlim([min_lat,max_lat])
plt.ylabel('No_of_trips')
plt.xlabel('dropoff_latitude')
plt.yscale('log')
plt.savefig('trip_dropoff_latitude_overall_distribution.png')
plt.show()
In [58]:
#Plotting  the pickup longitude distribution
min_long =-75
max_long = -73
trip_cleansed = trip[' pickup_longitude'][(trip[' pickup_longitude']> min_long) & (trip[' pickup_longitude']< max_long)]
plt.hist(trip_cleansed,alpha =0.5, bins = 1000)
plt.xlim([min_long,max_long])
plt.ylabel('No_of_trips')
plt.xlabel('pickup_longitude')
plt.yscale('log')
plt.savefig('trip_pickup_longitude_overall_distribution.png')
plt.show()
In [57]:
#Plotting the dropoff longitude distribution
min_long =-75
max_long = -73
trip_cleansed = trip[' dropoff_longitude'][(trip[' dropoff_longitude']> min_long) & (trip[' dropoff_longitude']< max_long)]
plt.hist(trip_cleansed,alpha =0.5, bins = 1000)
plt.xlim([min_long,max_long])
plt.ylabel('No_of_trips')
plt.xlabel('dropoff_longitude')
plt.yscale('log')
plt.savefig('trip_dropoff_longitude_overall_distribution.png')
plt.show()

Final grid boundary decision

As the above distributions point out, the mode is centred around approximately 40.75, -73.85; we take regions centred around the same, extending until the monthly count of trips falls below 1000 (that's just ~30 daily trips at a location)

Also, as per research, NYC is ~800 sq km, and 1 degree of latitude is ~110 km while 1 degree of longitude is ~90 km at 40 degrees latitude - we assume the chosen rectangular grid can mimic the dynamics of NYC to a good extent
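A quick check on the grid size: the latitude span is 40.85 - 40.65 = 0.20 degrees ~ 0.20 x 110 ~ 22 km, and the longitude span is -73.75 - (-74.05) = 0.30 degrees ~ 0.30 x 90 ~ 27 km, so the grid covers roughly 22 x 27 ~ 594 sq km, of the same order as NYC's ~800 sq km.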

In [ ]:
min_lat = 40.65
max_lat = 40.85

min_long = -74.05
max_long = -73.75


df = trip[(trip[' pickup_latitude']> min_lat) & (trip[' pickup_latitude']< max_lat)]
df = df[(df[' pickup_longitude']> min_long) & (df[' pickup_longitude']< max_long)]
df = df[(df[' dropoff_latitude']> min_lat) & (df[' dropoff_latitude']< max_lat)]
df = df[(df[' dropoff_longitude']> min_long) & (df[' dropoff_longitude']< max_long)]

plt.hist(df[' pickup_latitude'],alpha =0.5, bins = 1000)
plt.xlim([min_lat,max_lat])
plt.ylabel('No_of_trips')
plt.xlabel('pickup_latitude_final_grid')
plt.yscale('log')
plt.savefig('trip_pickup_latitude_final_distribution.png')
plt.show()

plt.hist(df[' pickup_longitude'],alpha =0.5, bins = 1000)
plt.xlim([min_long,max_long])
plt.ylabel('No_of_trips')
plt.xlabel('pickup_longitude')
plt.yscale('log')
plt.savefig('trip_pickup_longitude_final_distribution.png')
plt.show()

Appendix 2. The code snippet for determining the relation between 'trip_time' & 'fare_amount'

In [102]:
Y=df[' fare_amount'][df[' fare_amount']>0]
X = pd.DataFrame(df[df[' fare_amount']>0][' trip_time_in_secs'])


X['square_x'] = X[' trip_time_in_secs']**2
X['cube_X'] = X[' trip_time_in_secs']**3
X['inverse_X'] = (X[' trip_time_in_secs'].add(.00001))**-1

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
                                                    random_state = 0,train_size = 0.8)
alphas= [1e-6,1e-3, 1e0, 1e3]
best_score =0 

for alpha in alphas:
    model=Ridge(alpha=alpha, normalize = True, fit_intercept=True)
    model.fit(X_train,Y_train)
    score = model.score(X_test,Y_test)
    if score > best_score:
        best_score =score
        best_model = model


print (best_score)
print ("Intercept term : {}".format(best_model.intercept_))
for i,items in enumerate(best_model.coef_):
    print ("Coefficient {} : {}".format(X.columns[i], items))
0.74595543337
Intercept term : 2.090110912038803
Coefficient  trip_time_in_secs : 0.011915157943849956
Coefficient square_x : 2.520786065119681e-06
Coefficient cube_X : -5.071416157159069e-10
Coefficient inverse_X : 8.346645901328321e-05

As the coefficients point out, 'trip_fare' depends linearly on 'trip_time'
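For instance, for a typical 600 s trip, the linear term contributes about 0.0119 * 600 ~ 7.15 to the fare, while the quadratic term adds only 2.52e-06 * 600^2 ~ 0.91 and the cubic term subtracts 5.07e-10 * 600^3 ~ 0.11, so the linear term dominates.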

Appendix 3. The distribution of mean fare per min, showing that any value greater than 4 is likely to be an outlier

In [28]:
df = pd.read_pickle(filename)
print (len(df_hourly_rate_per_route))
print (len(df_hourly_rate_per_route['mean_fare_per_min'][df_hourly_rate_per_route['mean_fare_per_min'] < 4 ]))
           
df_hourly_rate_per_route['mean_fare_per_min'].hist(bins=1000)
plt.xlim(0,8)
plt.savefig('mean_fare_rate')
plt.show()
71183
71183

Appendix 4. The plot of the 'Fare Amount' distribution, showing that any value higher than 100 is likely to be an outlier

In [43]:
plt.hist(df[' fare_amount'],bins=1000, alpha =0.5)
plt.yscale('log')
plt.xlabel('fare_amount')
plt.ylabel('no of trips')
plt.title('Distribution of fare')
plt.savefig('Fare amount in the grid.png')
plt.show()

Appendix 5: Comparing traffic flow at different times of day

In [72]:
df_19 = df[df['pickup_hour']==19]
plt.hist(df_19[' trip_distance'],alpha =0.5, bins = 100)
plt.xlim([0,30])
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Trip Distance')
plt.title('Distribution of trip distance at 19:00 hrs')
# plt.savefig('tip_amount_distribution.png')
plt.show()
In [74]:
df_12 = df[df['pickup_hour']==12]
plt.hist(df_12[' trip_distance'],alpha =0.5, bins = 100)
plt.xlim([0,30])
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Trip Distance')
plt.title('Distribution of trip distance at 12:00 hours')
# plt.savefig('tip_amount_distribution.png')
plt.show()
In [ ]: