#Do the imports at the start of the script
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
import sys
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split #sklearn.cross_validation was removed in newer scikit-learn; the function is identical
from sklearn.tree import DecisionTreeRegressor
import numpy as np
#Read the csv data files
trip = pd.read_csv('trip_data_4.csv')
fare = pd.read_csv('trip_fare_4.csv')
#Capture the day of the month of the pickup from the pickup_datetime timestamp
trip['pickup_day'] = pd.DatetimeIndex(trip[' pickup_datetime']).day
trip.to_pickle('trip_pickup_day.pkl')
print (len(fare))
fare.head()
print (len(trip))
trip.head()
#Extract the frequency of passenger counts per cab into 2 lists - index and respective counts
#Plotting the counts and setting the corresponding xticks as the index
passenger_count = trip[' passenger_count'].value_counts().sort_index()
passenger_count_list_index = [index for index in passenger_count.index]
passenger_count_list_values =[passenger_count[index] for index in passenger_count.index]
no_of_types = range(len(passenger_count))
plt.xticks([x+0.5 for x in no_of_types],passenger_count_list_index) #Adding 0.5 to place the xticks at middle of bar
plt.bar(no_of_types,passenger_count_list_values, alpha=0.5) #Discrete variable, hence bar chart
plt.yscale('log') #Adding log scale, as the values fluctuate widely
plt.ylabel('No of trips')
plt.xlabel('No of passengers in cab')
plt.title('Distribution of passenger_counts')
plt.savefig('trip_passenger_count_distribution.png')
plt.show()
#Extracting frequency of mode of payment in a cab into 2 lists - index and respective counts
#Plotting the counts and setting the corresponding xticks as the index
payment_type = fare[' payment_type'].value_counts()
payment_type_list_index = [index for index in payment_type.index]
payment_type_list_values =[payment_type[index] for index in payment_type.index]
no_of_types = range(len(payment_type))
plt.xticks([x+0.5 for x in no_of_types],payment_type_list_index) #Adding 0.5 to place the xticks at middle of bar
plt.bar(no_of_types,payment_type_list_values, alpha=0.5) # Discrete variable, hence bar chart
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Payment modes')
plt.title('Distribution of payment_types')
plt.savefig('fare_payment_type_distribution.png')
plt.show()
#Plot the histogram of fare amount frequency with min and max values as limits
#Continuous variable , hence a histogram being plotted
plt.hist(fare[' fare_amount'],alpha =0.5, bins = 100)
plt.xlim([fare[' fare_amount'].min(),fare[' fare_amount'].max()])
plt.yscale('log') #log as the values were ranging across large range
plt.ylabel('No of trips')
plt.xlabel('Fare amount')
plt.title('Distribution of fare')
plt.savefig('fare_amount_distribution.png')
plt.show()
#Plot the histogram of tip amount frequency with min and max values as limits
#Continuous variable , hence a histogram being plotted
plt.hist(fare[' tip_amount'],alpha =0.5, bins = 100)
plt.xlim([fare[' tip_amount'].min(),fare[' tip_amount'].max()])
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Tip amount')
plt.title('Distribution of tips')
plt.savefig('tip_amount_distribution.png')
plt.show()
#Plot the histogram of total amount frequency with min and max values as limits
#Continuous variable , hence a histogram being plotted
plt.hist(fare[' total_amount'],alpha =0.5, bins = 100)
plt.xlim([fare[' total_amount'].min(),fare[' total_amount'].max()])
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Total amount')
plt.title('Distribution of total_amount')
plt.savefig('total_amount_distribution.png')
plt.show()
#Assumption: Busiest hour would imply both pickups and dropoffs are highest
#First generate pickup and dropoff hours from the timestamps, then compute the hourly frequency of pickups & dropoffs
#Discrete variable , hence a bar chart being plotted
trip['pickup_hour'] = trip[' pickup_datetime'].map( lambda x: pd.to_datetime(x).hour )
trip['dropoff_hour'] = trip[' dropoff_datetime'].map( lambda x: pd.to_datetime(x).hour )
#Combine pickup & dropoff frequency per hour
time_distribution = trip['pickup_hour'].value_counts().add(trip['dropoff_hour'].value_counts())
time_distribution.plot.bar(alpha=0.5)
print ("The top 5 busiest hours in descending order are")
busiest_hours = time_distribution.sort_values(ascending=False).index[:5].tolist() #Sort the series based on frequency
print (busiest_hours)
plt.ylabel('No. of trips')
plt.title('Distribution of trips - hourly basis')
plt.savefig('time_distribution.png')
plt.show()
Working with the entire dataset was becoming tough, with processing times for each snippet running into hours. Hence I took the relevant columns from the fare dataset and concatenated them with the trip dataset, as those were the only columns from the fare dataset I was going to use from here on.
trip = pd.read_pickle('trip_pickup_day.pkl')
#Getting the pickup hour from the DateTimeStamp
print ("Getting the hour data of the pickups.....")
trip['pickup_hour'] = trip[' pickup_datetime'].map( lambda x: pd.to_datetime(x).hour )
print ("Reading the fare csv file.....")
fare = pd.read_csv('trip_fare_4.csv')
fare = fare[[' fare_amount', ' tip_amount']]
#Merging the columns fare amount and tip amount, as they are the only relevant columns from the fare dataset
print ("Concatenating the fare & tip amount to the original trip dataset")
trip_new = pd.concat([trip, fare], axis=1).reindex(trip.index) #join_axes was removed in newer pandas; reindex keeps trip's rows
print("Saving the final datset")
trip_new.to_pickle('ultimate_datset.pkl')
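An aside on speed: the .map(lambda x: pd.to_datetime(x).hour) pattern used above re-parses every timestamp individually. Parsing the whole column once and using the .dt accessor is typically far faster; a sketch of the equivalent call:
#vectorised equivalent of the per-row lambda above
trip['pickup_hour'] = pd.to_datetime(trip[' pickup_datetime']).dt.hour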
The processing time was extremely high for any operation because the dataset was huge. For the rest of this part, at the cost of some generality, I chose a single day to represent the month. This is evidently biased, as the distribution plotted below shows, but operating on the full dataset would have been tough on this 2GB laptop. Hence 1st April 2013, the day with the fewest rides in the month (refer Appendix), is used for all references hereon.
#Read data and choose the day with least records to save processing time
trip = pd.read_pickle('ultimate_dataset.pkl')
trip['pickup_day'].value_counts().plot.bar(alpha=0.5)
plt.savefig('day_of_month_dist.png')
plt.show()
#Choosing Monday, though the inherent bias is evident as Mondays
#seem to have the fewest pickups in the entire month
trip_1=trip[trip['pickup_day']==1]
trip_1.to_pickle('trip_1.pkl')
For answering the rest of the basic questions on trips and locations (the joint distribution of latitude & longitude), I chose to superimpose a grid on the map of NYC, dividing the region into small squares.
The location of NYC, as per Google, is 40.71, -74.00, so the grid should contain these co-ordinates. The grid is chosen such that the least trip frequency in any region is at least 1000, centred around the mode. As per the distributions plotted in Appendix 1, we choose the grid boundaries as [40.65, 40.85] for latitude and [-74.05, -73.75] for longitude.
Also, NYC is roughly 800 sq km, and at 40 degrees latitude 1 degree of latitude is about 110 km while 1 degree of longitude is about 85 km - so we assume the chosen rectangular grid can mimic the dynamics of NYC to a good extent.
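To make the cell geometry concrete, here is a quick back-of-the-envelope check (a sketch; the 0.02 degree spacing is the class default below, and the km-per-degree factors are standard approximations at NYC's latitude):
import math

lat_min, lat_max = 40.65, 40.85
long_min, long_max = -74.05, -73.75
spacing = 0.02  #default cell size in degrees

km_per_deg_lat = 111.0  #roughly constant everywhere
km_per_deg_long = 111.0 * math.cos(math.radians(40.75))  #shrinks with latitude, ~84 km here

n_rows = round((lat_max - lat_min) / spacing)    #10 rows
n_cols = round((long_max - long_min) / spacing)  #15 columns
cell_km = (spacing * km_per_deg_lat, spacing * km_per_deg_long)
print("{} x {} cells, each ~{:.1f} km x {:.1f} km".format(n_rows, n_cols, *cell_km))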
class grid_network:
def __init__(self,lat = (40.85,40.65), long = (-73.75,-74.05),x_space=0.02, y_space=0.02):
#boundaries of the grid and spacing are stored as class variables
self.lat_1 = min(lat)
self.lat_2 = max(lat)
self.long_1 = min(long)
self.long_2 = max(long)
self.x_space = x_space
self.y_space = y_space
# functions to convert the co-ordinates to their pickup and dropoff regions, storing them as columns
def pickup_region(self,df):
lat_reg= ((df[' pickup_latitude'] - self.lat_1)//self.y_space).astype(int)
long_reg = ((df[' pickup_longitude'] - self.long_1)//self.x_space).astype(int)
df['pickup_reg'] = lat_reg.astype(str) + ',' + long_reg.astype(str)
return df
def dropoff_region(self,df):
lat_reg= ((df[' dropoff_latitude'] - self.lat_1)//self.y_space).astype(int)
long_reg = ((df[' dropoff_longitude'] - self.long_1)//self.x_space).astype(int)
df['dropoff_reg'] = lat_reg.astype(str) + ',' + long_reg.astype(str)
return df
# function to convert region to lat,long co-ordinates of the centre of the small square
def reg_to_coord(self, reg):
lat = round((int(reg.split(',')[-2])+0.5)*self.y_space + self.lat_1,3)
long = round((int(reg.split(',')[-1])+0.5)*self.x_space + self.long_1,3)
return '{},{}'.format(lat,long)
#function to get the busy regions of the city and convert it back to lat, long
#busy would mean dropoff and pickups are both high
#passing val =10 in the function to choose top 10 busy locations
def get_busy_locations(self,df,val=10):
locations = (df['dropoff_reg'].value_counts()).add(df['pickup_reg'].value_counts(), fill_value =0)
print("The top {} busiest locations are ".format(val))
for key in locations.sort_values(axis=0, ascending = False).keys()[:val]:
print (self.reg_to_coord(key), key, locations[key])
# function that gets the trip route with the highest sigma (std dev) of travel time
def get_region_with_highest_sigma_time(self,df):
df_region_with_highest_sigma = df.groupby(['trip_route']).agg({' trip_time_in_secs':'std', 'trip_route':'count'})
#apply a filter of 10 trips so that outlier datapoints aren't considered
df_region_with_highest_sigma = df_region_with_highest_sigma[df_region_with_highest_sigma['trip_route']>10]
item = df_region_with_highest_sigma.sort_values(' trip_time_in_secs', ascending =False).index[0]
keys = df_region_with_highest_sigma[' trip_time_in_secs'].loc[item]
print ("The trip with max sigma in time : route {},pickup at {} and drop_off at {} and is {}s".format(item,
self.reg_to_coord(item.split('-')[-2]),self.reg_to_coord(item.split('-')[-1]),round(keys,2)))
#function that gets the trip route with the least sigma of fares
def get_region_with_consistent_fare(self,df):
df_consistent_fares = df.groupby(['trip_route']).agg({' fare_amount':'std', 'trip_route':'count'})
#apply a filter of 10 trips so that outlier datapoints aren't considered
df_consistent_fares = df_consistent_fares[df_consistent_fares['trip_route']>10]
item = df_consistent_fares.sort_values(' fare_amount',ascending = True).index[0]
keys = df_consistent_fares[' fare_amount'].loc[item]
print ("The trip with min variation in fare : route {},pickup at {} and drop_off at {} and is {}".format(item,
self.reg_to_coord(item.split('-')[-2]),self.reg_to_coord(item.split('-')[-1]),round(keys,2)))
# function to create the regions
def create_regions(self,df):
print ("Creating borders..")
#select the dataframe so that all pickups and dropoffs lie within the boundaries of the grid
df = df[(df[' pickup_longitude'] > self.long_1) & (df[' pickup_longitude'] < self.long_2)]
df = df[(df[' pickup_latitude'] > self.lat_1) & (df[' pickup_latitude'] < self.lat_2)]
df = df[(df[' dropoff_longitude'] > self.long_1) & (df[' dropoff_longitude'] < self.long_2)]
df = df[(df[' dropoff_latitude'] > self.lat_1) & (df[' dropoff_latitude'] < self.lat_2)]
df = self.pickup_region(df)
df = self.dropoff_region(df)
#define trip route as the string 'region_pickup-region_dropoff'
df['trip_route'] = df['pickup_reg'] + '-' + df['dropoff_reg']
#checking no of trips with source and sink as different regions
#ideally this should be high so as to capture dynamics within regions
print ("No of trips to another region : {:7d}".format(len(df.index[df['pickup_reg']!=df['dropoff_reg']])))
#In trying to maximise the above quantity, the regions could have been made so small
#that there are not enough trips in any region to draw conclusions
print ("No. of trips with frequency greater than 5 in day :{} ".format(sum(df['trip_route'].value_counts()>5)))
#store the processed data
filename = 'df_full_x_space_{}_y_space{}.pkl'.format(10*self.x_space,10*self.y_space)
df.to_pickle(filename)
return df,filename
To run this, the last block of code (creating the grid-based regions) must have been executed.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
df = pd.read_pickle('trip_1.pkl') #read the previously pickled file
trip = df # another name for the same dataframe (not an actual copy)
grid = grid_network(x_space = 0.01, y_space = 0.01) #initialise the class
trip,filename = grid.create_regions(trip) #create grid
grid.get_busy_locations(trip) #get the top 10 busy locations
grid.get_region_with_highest_sigma_time(trip) # get the trip route with highest sigma in travel time
grid.get_region_with_consistent_fare(trip) # get the route with least sigma in fares
print (filename) #print the filename for the next part of problem
To run this, the previous blocks must have been executed.
The nature of the relationship between trip fare and trip time needed to be established before proceeding with the problem statement.
The relationship seems linear, at least from a day's data, so we go with a linear model (see Appendix 2 for more details).
As it turns out the relationship is linear, so we calculate the "mean fare per min travelled" for each hour, for each source location and sink location.
Below is the distribution of "mean fare per min travelled". Examined more finely, it hardly ever goes beyond 4, so applying that cutoff as a filter on "mean fare per min travelled" makes sense (see Appendix 3 for the plot).
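As a toy illustration of the metric (hypothetical numbers, not values from the dataset): a route averaging a 12 dollar fare over 900 seconds yields 12*60/900 = 0.8 dollars per minute, comfortably below the cutoff of 4, while an implausible 52 dollar fare over 120 seconds would be zeroed out.
#hypothetical per-route averages, not real data
toy = pd.DataFrame({' fare_amount': [12.0, 52.0], ' trip_time_in_secs': [900, 120]})
toy['mean_fare_per_min'] = toy[' fare_amount']*60/toy[' trip_time_in_secs']
toy['mean_fare_per_min'] = toy['mean_fare_per_min'].where(toy['mean_fare_per_min'] < 4, 0) #26 $/min gets filtered to 0
print(toy)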
filename = 'NYC_Cabs_Data Challenge/df_full_x_space_0.1_y_space0.1.pkl'
df = pd.read_pickle(filename)
df_hourly_rate_per_route = df.groupby(['pickup_hour','pickup_reg','dropoff_reg']).agg({' trip_time_in_secs':'mean',' fare_amount':'mean','dropoff_reg':'count' })
#replace trip times of 0 secs (very likely bad data) with a high number
#so that the mean fare per min becomes small and isn't considered in the optimisation
df_hourly_rate_per_route[' trip_time_in_secs']=df_hourly_rate_per_route[' trip_time_in_secs'].replace(0,10000)
df_hourly_rate_per_route['mean_fare_per_min']= (df_hourly_rate_per_route[' fare_amount']*60).divide(df_hourly_rate_per_route[' trip_time_in_secs'])
#the dataframe for calculating the avg trip times for each hour, and for each pair of pickup and dropoff regions
df_avg_time_routes = df.groupby(['pickup_reg','pickup_hour','dropoff_reg'])[' trip_time_in_secs'].agg({'mean_travel_time' : 'mean'})
#the dataframe for calculating demand for each region at each hour, by counting the no of pickups at the region
df_demand_hourly_regions = df.groupby(['pickup_hour', 'pickup_reg'])['pickup_reg'].agg({'avg_hourly_demand': 'count'})
#the "mean fare per min travelled" parameter filtered of outlier datapoints by applying filter value of 4
df_hourly_rate_per_route['mean_fare_per_min'] = df_hourly_rate_per_route['mean_fare_per_min'].apply(lambda x: x if x < 4 else 0)
def select_next_opt_trip(start_time, start_reg = None, end_time=23,total_fare=0,trip_no=0, start=True,next_day=False, verbose = 1):
#flag to make sure if driving onto next_day, end_time > start_time
if next_day:
end_time = end_time +24
next_day =False
#flag to make sure start_time is reset to 0 as soon as it's past 23:59
if start_time >= 24:
start_time= start_time -24
end_time = end_time - 24
#if the flag is start, we start with region which has the highest "mean fare per min" at that hour
if start:
start_hour =int(start_time)
start_reg = df_hourly_rate_per_route['mean_fare_per_min'].loc[start_hour].idxmax()[0]
if verbose > 0 :
print ("He starts in the region {}, [{}] at {}".format(start_reg,grid.reg_to_coord(start_reg),start_time))
start=False
# check that start_time is less than end_time - the stopping criterion
if start_time < end_time:
if round(start_time)==24: #to make sure that after 23:30 hrs, start_hour isn't made to be 24
start_hour =0
else:
start_hour = round(start_time) # if round(start_time) isn't 24 , round it to nearest integer
#get the list of possible dropoff regions at the start location at that hour
df = df_avg_time_routes.loc[start_reg].loc[start_hour]
#get the avg time for arrival at each of possible dropoff locations
df['hour_arrival_dropoff'] = round(df['mean_travel_time'].divide(3600) + start_time).astype(int)
df['expected_demand_arrival_flag'] =0
df['fare_per_min'] =0
#get the demand at each dropoff location at the corresponding time of arrival & check if demand > threshold
#threshold is calculated based on median of demands at that hour across all regions
for i in range(len(df)):
try:
threshold = df_demand_hourly_regions.loc[df['hour_arrival_dropoff'].iloc[i]]['avg_hourly_demand'].median()*0.2
df['expected_demand_arrival_flag'].iloc[i] = df_demand_hourly_regions.loc[df['hour_arrival_dropoff'].iloc[i]].loc[df.index[i]][0] > threshold
except:
continue
#get the "mean fare per min" parameter for all possible dropoff locations
for i in range(len(df)):
try:
df['fare_per_min'].iloc[i] = df_hourly_rate_per_route.loc[df['hour_arrival_dropoff'].iloc[i]].loc[start_reg].loc[df.index[i]]['mean_fare_per_min']
except:
continue
#calculate optimum score as product of maximising parameter - "mean fare per min" & demand flag
df['optimum_score']=df['fare_per_min'].multiply(df['expected_demand_arrival_flag'])
#see if the best optimum score exists then keep on recursively checking for next rides
try:
dropoff_reg = df['optimum_score'].idxmax()
#if the best score is 0, this implies likely that demand in the best_possible dropoff region is below threshold
#so search for the nearest best region in the neighbourhood
if df['optimum_score'].max() ==0:
if verbose > 0 :
print ("Demand is below the threshold, so searching around..")
adj_reg,travel_time = search_adjacent_region(start_reg,start_time,verbose = verbose)
start_time += travel_time
if verbose > 0 :
print ("Starting journey from adjacent region {} at {}:{:2.0f}".format(adj_reg, int(start_time),
(start_time-int(start_time))*60))
return select_next_opt_trip(start_time,adj_reg,end_time, total_fare,trip_no,start,next_day,verbose)
else:
#Calculate expected dropoff time , fare and keep on recursively choosing the best trip
dropoff_time = round(start_time + (df['mean_travel_time'].loc[dropoff_reg])/3600,2) + 0.15 # added 0.15 hrs ~ 9 min to account for fare collection and the next pickup
trip_no+=1
#calculate expected fare amount
expected_mean_fare=round(df_hourly_rate_per_route.loc[start_hour].loc[start_reg].loc[dropoff_reg][' fare_amount'],2)
total_fare+=expected_mean_fare
if verbose > 0 :
print ('Trip {}: Next dropoff is at {}, [{}] and at {}:{:2.0f}, expected_fare : {:4.2f}'.format(trip_no,
dropoff_reg,
grid.reg_to_coord(dropoff_reg),
int(dropoff_time),
round((dropoff_time-int(dropoff_time))*60),
expected_mean_fare))
return select_next_opt_trip(dropoff_time, dropoff_reg,end_time, total_fare,trip_no,start,next_day,verbose)
except:
#if there's no demand at the current pickup region, need to search nearest region for a good pickup
if verbose > 0 :
print ("No demand for cab at region {}".format(start_reg))
adj_reg,travel_time = search_adjacent_region(start_reg,start_time,verbose = verbose)
start_time += travel_time
if verbose > 0 :
print ("Starting journey from adjacent region {} at {}:{:2.0f}".format(adj_reg, int(start_time),
(start_time-int(start_time))*60))
return select_next_opt_trip(start_time,adj_reg,end_time, total_fare,trip_no,start,next_day,verbose)
else:
if verbose > 0 :
print ("It's already {}, time to pack".format(end_time))
print ("Already earnt fare amount {} for the day!".format(round(total_fare,2)))
return total_fare
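Note that select_next_opt_trip recurses once per ride, so a long shift can hit Python's default recursion limit of 1000; raising it (presumably what the so-far-unused import sys at the top was meant for) is a cheap safeguard:
sys.setrecursionlimit(10000) #each recursive call is one ride; the default limit of 1000 can be hit on long shifts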
def search_adjacent_region(reg,time,degree=3,verbose =1 ):
if verbose > 0 :
print ("Searching adjacent regions around {} at {}:{:2.0f}....".format(reg,int(time),(time-int(time))*60))
time = int(time)
best_reg = None
best_mean_fare = -1
#vals is the variable indicating the degree of search
#if it's a 1st-degree search, just search the adjacent 8 squares for the best "mean fare per min" parameter along with demand
#keep widening the degree of search until we find a region with demand
for vals in range(1,degree):
reg_1 = int(reg.split(',')[-2])
reg_2 = int(reg.split(',')[-1])
reg_list =[]
# form a list of all the adjacent regions to search on in ith degree search
for i in range(-vals,vals+1):
for j in range(-vals,vals+1):
if (reg_1+i >= 0) & (reg_2+j >=0):
reg_element = str(reg_1+i) +',' +str(reg_2+j)
reg_list.append(reg_element)
#for all the regions to be searched in the ith degree check for the best " mean fare per min" if there's demand
for regs in reg_list:
try:
mean_fare = df_hourly_rate_per_route['mean_fare_per_min'].loc[time].loc[regs].max()
travel_time_list = df_avg_time_routes['mean_travel_time'].loc[(df_avg_time_routes.index.get_level_values('pickup_reg')==reg) &
(df_avg_time_routes.index.get_level_values('dropoff_reg') == regs)]
if (mean_fare > best_mean_fare) & (regs!=reg):
best_mean_fare = mean_fare
best_reg = regs
best_travel_time_list = travel_time_list
except:
#if there's no demand at the region being searched, continue searching the other regions
continue
# if at the end of searching all regions in the ith degree search
#even if there's one region with demand, we move there
#if there are more than one, choose the one with best "mean fare per min"
#if there's no region with demand, increase the degree of search
if best_reg is not None:
if len(best_travel_time_list) > 0:
best_travel_time= (best_travel_time_list.mean())/3600
else:
#if there's no data to estimate the average travel time between the current region and the 'best' adjacent region
#assume the travel time to the adjacent region is 6*degree-of-search minutes (based on observations)
if verbose > 0 :
print("Not enough data to get mean travel time to {} from {}, approx_time : {}".format(reg,best_reg,6*vals))
best_travel_time = 0.1*vals #~6 min per degree of search, matching the comment above
if verbose > 0 :
print ("Nearest Best Region found at {} at distance of {} mins ".format(best_reg, round(best_travel_time*60,2)))
return best_reg, round(best_travel_time,2)
else:
if verbose > 0 :
print("No good region found near region {} in {} degree search".format(reg,vals))
continue
You can specify the start_time and end_time and print the routes the cab driver takes.
You can also pass a custom start location by setting the flag 'start' to False; the code then skips searching for the best region to start from at that hour.
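For example (a sketch; the region label '10,15' is a hypothetical grid cell, not one taken from the results above):
#start at 9:00 from a chosen grid cell instead of the automatically selected best region
select_next_opt_trip(9, start_reg='10,15', end_time=17, start=False, verbose=1)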
To run this, the functions above must have been defined.
The snippet below computes the maximum earnings for the given parameters.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
grid = grid_network(x_space = 0.01, y_space = 0.01)
select_next_opt_trip(23,end_time = 1, next_day=True, verbose = 1) #use different parameters as required
As computed below, the median driving time per driver is about 6 hours. This snippet computes the optimised revenue for a driver driving for 6 hours, starting at different times of the day.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
fares ={}
for t in range(0,24):
if t+6 > 24:
next_day=True
end_time = (t+6) - 24
else:
next_day = False
end_time = (t+6)
fares[t] = select_next_opt_trip(t,end_time = end_time, next_day=next_day,verbose = 0)
plt.bar(range(len(fares)), list(fares.values()), align='center',alpha =0.5) #list() so the dict views work under Python 3
plt.xticks(range(len(fares)), list(fares.keys()))
plt.show()
#per-driver totals: fare sum and hours driven (span of pickup hours; sentinel -2 filters out drivers whose first pickup is at or before 8:00)
df_driver = df.groupby([' hack_license']).agg({' fare_amount':'sum', 'pickup_hour': lambda x:np.max(x)-np.min(x) if np.min(x) > 8 else -2})
df_driver = df_driver[df_driver['pickup_hour']>0]
print (df_driver['pickup_hour'].median())
print (df_driver[' fare_amount'].median())
def select_next_opt_trip_with_min_time(start_time, start_reg = None,total_fare=0,trip_no=0,total_time=0,start=True):
#check if the start_time is within 23:59 hours, adjust accordingly if it shoots out
if start_time >= 24:
start_time= start_time -24
#check if it's start of the trip to select the location automatically
if start:
start_hour = round(start_time)
start_reg = df_hourly_rate_per_route['mean_fare_per_min'].loc[start_hour].idxmax()[0]
#check if the stopping criterion is reached: drive until the median daily fare computed above (~177.5) is earned
if total_fare < 177.5 :
if round(start_time)==24: #to make sure that after 23:30 hrs, start_hour isn't made to be 24
start_hour = 0
else:
start_hour =round(start_time)# if round(start_time) isn't 24 , round it to nearest integer
#get the list of possible dropoff regions at the start location at that hour
df = df_avg_time_routes.loc[start_reg].loc[start_hour]
#get the avg time for arrival at each of possible dropoff locations
df['hour_arrival_dropoff'] = round(df['mean_travel_time'].divide(3600) + start_time).astype(int)
df['expected_demand_arrival_flag'] =0
df['fare_per_min'] =0
#get the demand at each dropoff location at the corresponding time of arrival & check if demand > threshold
#threshold is calculated based on median of demands at that hour across all regions
for i in range(len(df)):
try:
threshold = df_demand_hourly_regions.loc[df['hour_arrival_dropoff'].iloc[i]]['avg_hourly_demand'].median()*0.2
df['expected_demand_arrival_flag'].iloc[i] = df_demand_hourly_regions.loc[df['hour_arrival_dropoff'].iloc[i]].loc[df.index[i]][0] > threshold
except:
continue
#get the "mean fare per min" parameter for all possible dropoff locations
for i in range(len(df)):
try:
df['fare_per_min'].iloc[i] = df_hourly_rate_per_route.loc[df['hour_arrival_dropoff'].iloc[i]].loc[start_reg].loc[df.index[i]]['mean_fare_per_min']
except:
continue
#calculate optimum score as product of maximising parameter - "mean fare per min" & demand flag
df['optimum_score']=df['fare_per_min'].multiply(df['expected_demand_arrival_flag'])
#see if the best optimum score exists then keep on recursively checking for next rides
try:
dropoff_reg = df['optimum_score'].idxmax()
#if the best score is 0, this implies likely that demand in the best_possible dropoff region is below threshold
#so search for the nearest best region in the neighbourhood
if df['optimum_score'].max() ==0:
adj_reg,travel_time = search_adjacent_region(start_reg,start_time)
start_time += travel_time
total_time += travel_time
#then keep on recursively choosing the best trip from the adjacent region
return select_next_opt_trip_with_min_time(start_time,adj_reg, total_fare,trip_no,total_time,start=False)
else:
dropoff_time = round(start_time + (df['mean_travel_time'].loc[dropoff_reg])/3600,2) + 0.15
total_time+= dropoff_time-start_time
trip_no+=1
#calculate expected fare amount
expected_mean_fare=round(df_hourly_rate_per_route.loc[start_hour].loc[start_reg].loc[dropoff_reg][' fare_amount'],2)
total_fare+=expected_mean_fare
return select_next_opt_trip_with_min_time(dropoff_time, dropoff_reg, total_fare,trip_no,total_time,start=False)
except:
#if there's no demand at the current pickup region, need to search nearest region for a good pickup
adj_reg,travel_time = search_adjacent_region(start_reg,start_time)
start_time += travel_time
total_time += travel_time
return select_next_opt_trip_with_min_time(start_time,adj_reg,total_fare,trip_no,total_time,start=False)
else:
return total_time
def search_adjacent_region(reg,time,degree=3 ):
time = int(time)
best_reg = None
best_mean_fare = -1
#vals is the variable indicating the degree of search
#if it's a 1st-degree search, just search the adjacent 8 squares for the best "mean fare per min" parameter along with demand
#keep widening the degree of search until we find a region with demand
for vals in range(1,degree):
reg_1 = int(reg.split(',')[-2])
reg_2 = int(reg.split(',')[-1])
reg_list =[]
# form a list of all the adjacent regions to search on in ith degree search
for i in range(-vals,vals+1):
for j in range(-vals,vals+1):
if (reg_1+i >= 0) & (reg_2+j >=0):
reg_element = str(reg_1+i) +',' +str(reg_2+j)
reg_list.append(reg_element)
#for all the regions to be searched in the ith degree check for the best " mean fare per min" if there's demand
for regs in reg_list:
try:
mean_fare = df_hourly_rate_per_route['mean_fare_per_min'].loc[time].loc[regs].max()
travel_time_list = df_avg_time_routes['mean_travel_time'].loc[(df_avg_time_routes.index.get_level_values('pickup_reg')==reg) &
(df_avg_time_routes.index.get_level_values('dropoff_reg') == regs)]
if (mean_fare > best_mean_fare) & (regs!=reg):
best_mean_fare = mean_fare
best_reg = regs
best_travel_time_list = travel_time_list
except:
#if there's no demand at the region being searched, continue searching the other regions
continue
#if at the end of searching all regions in the ith degree search
#even if there's one region with demand, we move there
#if there are more than one, choose the one with best "mean fare per min"
#if there's no region with demand, increase the degree of search
if best_reg is not None:
if len(best_travel_time_list) > 0:
best_travel_time= (best_travel_time_list.mean())/3600
else:
#if there's no data to estimate the average travel time between the current region and the 'best' adjacent region
#assume the travel time to the adjacent region is 6*degree-of-search minutes (based on observations)
best_travel_time = 0.1*vals #~6 min per degree of search
return best_reg, round(best_travel_time,2)
else:
continue
with warnings.catch_warnings():
warnings.simplefilter("ignore")
best_total_time = 24
for i in range(24):
total_time = select_next_opt_trip_with_min_time(i)
print ("Starting at {} total time to earn min avg fares is {}".format(i,round(total_time,2)))
if total_time < best_total_time:
best_total_time = total_time
best_start_hour = i
print ("Best Start hour is {}, taking only {} hours to get to average fare while average time of travel for cab drivers are 6 hours".format(best_start_hour, round(best_total_time,2)))
To apply an ML model, we need to select the relevant columns and then preprocess them.
#Read the pickled file, created earlier
df = pd.read_pickle('ultimate_dataset.pkl')
df = df[[' pickup_latitude',' pickup_longitude', ' dropoff_latitude', ' dropoff_longitude', 'pickup_hour','pickup_day', ' fare_amount', ' tip_amount']]
days_list = ['Sunday','Monday', 'Tuesday', 'Wednesday', 'Thursday','Friday', 'Saturday']
days_list_dict={}
for i,day in enumerate(days_list):
days_list_dict[i]=day
#Convert the pickup days into the corresponding weekdays (1 April 2013 was a Monday, so day % 7 indexes days_list correctly)
df['day_of_week']= df['pickup_day'].apply(lambda x : x%7).map(days_list_dict)
#Store the data back into a pickle file
df.to_pickle('ML_model.pkl')
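A quick sanity check of the day-of-week mapping against the 2013 calendar (a sketch using only the standard library):
import calendar
#calendar.weekday: Monday == 0 ... Sunday == 6
for day in (1, 7, 15):
    name = calendar.day_name[calendar.weekday(2013, 4, day)]
    assert days_list[day % 7] == name, (day, name)
print("day-of-week mapping checks out")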
The data is pre-processed to prepare it for training across various ML models - the steps include one-hot encoding the categorical variables and normalising the numeric variables.
Also, to remove outliers and cleanse the data, observations with 'fare_amount' greater than 100 are ignored, as Appendix 4 makes clear that values above 100 are most likely outliers.
#Read the pickled file
df = pd.read_pickle('ML_model.pkl')
#Drop the column as it's unnecessary
df = df.drop(['pickup_day'],axis=1)
#Select the min and max lat,long as chosen earlier, then filter the dataset
#so that all pickups and dropoffs are within the chosen grid
min_lat = 40.65
max_lat = 40.85
min_long = -74.05
max_long = -73.75
df = df[(df[' pickup_latitude']> min_lat) & (df[' pickup_latitude']< max_lat)]
df = df[(df[' pickup_longitude']> min_long) & (df[' pickup_longitude']< max_long)]
df = df[(df[' dropoff_latitude']> min_lat) & (df[' dropoff_latitude']< max_lat)]
df = df[(df[' dropoff_longitude']> min_long) & (df[' dropoff_longitude']< max_long)]
#As seen from the distribution of fare amount above, fares above 100 are likely to be outliers
#so all observations corresponding to them have been removed
df = df[df[' fare_amount']<100]
#the columns to be normalised are the ones with numerical features
cols_to_norm = [columns for columns in df.columns if not columns == 'day_of_week']
df[cols_to_norm] = df[cols_to_norm].apply(lambda x: (x - x.mean()) / (x.max() - x.min()))
#convert the only categorical feature - "day_of_week" - into a one-hot encoded vector
df = pd.concat([df, pd.get_dummies(df['day_of_week'])], axis=1)
#drop the categorical variable as it's already one-hot encoded
df = df.drop(['day_of_week'],axis=1)
df.to_pickle('Final_ML_model.pkl')
df = pd.read_pickle('Final_ML_model.pkl')
#building the final datasets of input and output variables
Y1= df[' fare_amount']
Y2 = df[' tip_amount']
df= df.drop([' fare_amount', ' tip_amount'], axis =1)
#split the data into train and test sets; a validation set was skipped for lack of time
X_train, X_test, Y1_train, Y1_test, Y2_train, Y2_test = train_test_split(df, Y1,Y2,
random_state = 0,train_size = 0.8)
Check for the best possible score using Ridge Regression
alphas= [1e-3, 1e0, 1e3]
best_score =0
for alpha in alphas:
model=Ridge(alpha=alpha)
model.fit(X_train,Y1_train)
score = model.score(X_test,Y1_test)
if score > best_score:
best_score =score
best_model = model
best_alpha = alpha
print ("Best Score : {} was for alpha : {} for Ridge Regression".format(round(best_score, 4),best_alpha))
Check for the best score using Lasso regression
#alphas= [1e-3, 1e0, 1e3] ----> best_score was 0.1105 for alpha =0.001
#alphas =[1e-6, 1e-4, 1e-2]-----> best score 0.1961 for alpha = 1e-06
alphas =[1e-9, 1e-7, 1e-5]
best_score =0
for alpha in alphas:
model=Lasso(alpha=alpha)
model.fit(X_train,Y1_train)
score = model.score(X_test,Y1_test)
if score > best_score:
best_score =score
best_model = model
best_alpha = alpha
print ("Best Score : {} was for alpha : {} for Lasso Regression".format(round(best_score,4),best_alpha))
Check for the best model using Elastic Net
alphas =[1e-3, 1e0, 1e3]
l1_ratios = [0.25, 0.5, 0.75]
best_score =0
for l1_ratio in l1_ratios:
for alpha in alphas:
model=ElasticNet(alpha=alpha,l1_ratio =l1_ratio)
model.fit(X_train,Y1_train)
score = model.score(X_test,Y1_test)
if score > best_score:
best_score =score
best_model = model
best_alpha = alpha
best_l1_ratio = l1_ratio
print ("Best Score : {} was for alpha : {}, l1_ratio :{} for Elastic Net Regression".format(round(best_score,4),best_alpha,best_l1_ratio))
max_depth_list =[10, 20,40]
min_samples_split_list = [100, 1000, 10000]
best_score =0
for depth in max_depth_list:
for split in min_samples_split_list:
model = DecisionTreeRegressor(max_depth = depth,min_samples_split = split)
model.fit(X_train, Y1_train)
score = model.score(X_test, Y1_test)
if score > best_score:
best_score =score
best_decision_tree_model = model
best_depth = depth
best_split = split
print ("Best Score : {} for max_depth :{} , min_sample_split : {} for DecisionTree Regresor".format(best_score, best_depth,best_split))
Check for the best possible score using Ridge Regression, this time for the tip amount
alphas= [1e-3, 1e0, 1e3]
best_score =0
for alpha in alphas:
model=Ridge(alpha=alpha)
model.fit(X_train,Y2_train)
score = model.score(X_test,Y2_test)
if score > best_score:
best_score =score
best_model = model
best_alpha = alpha
print ("Best Score : {} was for alpha : {} for Ridge Regression".format(round(best_score, 4),best_alpha))
alphas= [1e-3, 1e0, 1e3]
best_score =0
for alpha in alphas:
model=Lasso(alpha=alpha)
model.fit(X_train,Y2_train)
score = model.score(X_test,Y2_test)
if score > best_score:
best_score =score
best_model = model
best_alpha = alpha
print ("Best Score : {} was for alpha : {} for Lasso Regression".format(round(best_score, 4),best_alpha))
alphas =[1e-3, 1e0, 1e3]
l1_ratios = [0.25, 0.5, 0.75]
best_score =0
for l1_ratio in l1_ratios:
for alpha in alphas:
model=ElasticNet(alpha=alpha,l1_ratio =l1_ratio)
model.fit(X_train,Y2_train)
score = model.score(X_test,Y2_test)
if score > best_score:
best_score =score
best_model = model
best_alpha = alpha
best_l1_ratio = l1_ratio
print ("Best Score : {} was for alpha : {}, l1_ratio :{} for Elastic Net Regression".format(round(best_score,4),best_alpha,best_l1_ratio))
max_depth_list =[10, 20,40]
min_samples_split_list = [100, 1000, 10000]
best_score =0
for depth in max_depth_list:
for split in min_samples_split_list:
model = DecisionTreeRegressor(max_depth = depth,min_samples_split = split)
model.fit(X_train, Y2_train)
score = model.score(X_test, Y2_test)
if score > best_score:
best_score =score
best_decision_tree_model_tip = model
best_depth = depth
best_split = split
print ("Best Score : {} for max_depth :{} , min_sample_split : {} for DecisionTree Regresor".format(best_score, best_depth,best_split))
Hence, it's pretty clear that decision trees work best for predicting both the fare & the tip amount, owing to the non-linear relationships among the variables
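One way to see what drives those predictions is to inspect the fitted trees' feature importances (a sketch, assuming the best models found above are still in scope):
#importance of each input feature for the fare and tip trees, highest first
for name, model in [('fare', best_decision_tree_model), ('tip', best_decision_tree_model_tip)]:
    importances = pd.Series(model.feature_importances_, index=df.columns).sort_values(ascending=False)
    print(name)
    print(importances.head())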
The location of NYC, as per Google, is 40.71, -74.00, so the grid should contain these co-ordinates. The grid is chosen such that the least trip frequency in any region is at least 1000, centred around the mode.
For the same reasons, I plot the distributions of pickup & dropoff latitude & longitude, to pick the boundaries of the grid based on those distributions
#Plotting the pickup latitude distribution
min_lat =40
max_lat = 42
trip_cleansed = trip[' pickup_latitude'][(trip[' pickup_latitude']> min_lat) & (trip[' pickup_latitude']< max_lat)]
plt.hist(trip_cleansed,alpha =0.5, bins = 1000)
plt.xlim([min_lat,max_lat])
plt.ylabel('No_of_trips')
plt.xlabel('pickup_latitude')
plt.yscale('log')
plt.savefig('trip_pickup_latitude_overall_distribution.png')
plt.show()
#Plotting the dropoff latitude distribution
min_lat =40
max_lat = 42
trip_cleansed = trip[' dropoff_latitude'][(trip[' dropoff_latitude']> min_lat) & (trip[' dropoff_latitude']< max_lat)]
plt.hist(trip_cleansed,alpha =0.5, bins = 1000)
plt.xlim([min_lat,max_lat])
plt.ylabel('No_of_trips')
plt.xlabel('dropoff_latitude')
plt.yscale('log')
plt.savefig('trip_dropoff_latitude_overall_distribution.png')
plt.show()
#Plotting the pickup longitude distribution
min_long =-75
max_long = -73
trip_cleansed = trip[' pickup_longitude'][(trip[' pickup_longitude']> min_long) & (trip[' pickup_longitude']< max_long)]
plt.hist(trip_cleansed,alpha =0.5, bins = 1000)
plt.xlim([min_long,max_long])
plt.ylabel('No_of_trips')
plt.xlabel('pickup_longitude')
plt.yscale('log')
plt.savefig('trip_pickup_longitude_overall_distribution.png')
plt.show()
#Plotting the dropoff longitude distribution
min_long =-75
max_long = -73
trip_cleansed = trip[' dropoff_longitude'][(trip[' dropoff_longitude']> min_long) & (trip[' dropoff_longitude']< max_long)]
plt.hist(trip_cleansed,alpha =0.5, bins = 1000)
plt.xlim([min_long,max_long])
plt.ylabel('No_of_trips')
plt.xlabel('dropoff_longitude')
plt.yscale('log')
plt.savefig('trip_dropoff_longitude_overall_distribution.png')
plt.show()
As the above distributions show, the modes are centred around approximately 40.75, -73.85; we take regions centred around the same point, extending as long as the monthly trip count at a location doesn't fall below 1000 (roughly 33 trips per day).
Also, NYC is roughly 800 sq km, and at 40 degrees latitude 1 degree of latitude is about 110 km while 1 degree of longitude is about 85 km - so we assume the chosen rectangular grid can mimic the dynamics of NYC to a good extent.
min_lat = 40.65
max_lat = 40.85
min_long = -74.05
max_long = -73.75
df = trip[(trip[' pickup_latitude']> min_lat) & (trip[' pickup_latitude']< max_lat)]
df = df[(df[' pickup_longitude']> min_long) & (df[' pickup_longitude']< max_long)]
df = df[(df[' dropoff_latitude']> min_lat) & (df[' dropoff_latitude']< max_lat)]
df = df[(df[' dropoff_longitude']> min_long) & (df[' dropoff_longitude']< max_long)]
plt.hist(df[' pickup_latitude'],alpha =0.5, bins = 1000)
plt.xlim([min_lat,max_lat])
plt.ylabel('No_of_trips')
plt.xlabel('pickup_latitude_final_grid')
plt.yscale('log')
plt.savefig('trip_pickup_latitude_final_distribution.png')
plt.show()
plt.hist(df[' pickup_longitude'],alpha =0.5, bins = 1000)
plt.xlim([min_long,max_long])
plt.ylabel('No_of_trips')
plt.xlabel('pickup_longitude')
plt.yscale('log')
plt.savefig('trip_pickup_longitude_final_distribution.png')
plt.show()
Y=df[' fare_amount'][df[' fare_amount']>0]
X = pd.DataFrame(df[df[' fare_amount']>0][' trip_time_in_secs'])
X['square_X'] = X[' trip_time_in_secs']**2
X['cube_X'] = X[' trip_time_in_secs']**3
X['inverse_X'] = (X[' trip_time_in_secs'].add(.00001))**-1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
random_state = 0,train_size = 0.8)
alphas= [1e-6,1e-3, 1e0, 1e3]
best_score =0
for alpha in alphas:
model=Ridge(alpha=alpha, normalize = True, fit_intercept=True)
model.fit(X_train,Y_train)
score = model.score(X_test,Y_test)
if score > best_score:
best_score =score
best_model = model
print (best_score)
print ("Intercept term : {}".format(best_model.intercept_))
for i,items in enumerate(best_model.coef_):
print ("Coefficient {} : {}".format(X.columns[i], items))
As the coefficients show, the linear term dominates, so 'trip_fare' depends essentially linearly on 'trip_time'
df = pd.read_pickle(filename)
print (len(df_hourly_rate_per_route))
print (len(df_hourly_rate_per_route['mean_fare_per_min'][df_hourly_rate_per_route['mean_fare_per_min'] < 4 ]))
df_hourly_rate_per_route['mean_fare_per_min'].hist(bins=1000)
plt.xlim(0,8)
plt.savefig('mean_fare_rate.png')
plt.show()
plt.hist(df[' fare_amount'],bins=1000, alpha =0.5)
plt.yscale('log')
plt.xlabel('fare_amount')
plt.ylabel('no of trips')
plt.title('Distribution of fare')
plt.savefig('Fare amount in the grid.png')
plt.show()
df_19 = df[df['pickup_hour']==19]
plt.hist(df_19[' trip_distance'],alpha =0.5, bins = 100)
plt.xlim([0,30])
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Trip Distance')
plt.title('Distribution of trip distance at 19:00 hrs')
# plt.savefig('tip_amount_distribution.png')
plt.show()
df_12 = df[df['pickup_hour']==12]
plt.hist(df_12[' trip_distance'],alpha =0.5, bins = 100)
plt.xlim([0,30])
plt.yscale('log')
plt.ylabel('No of trips')
plt.xlabel('Trip Distance')
plt.title('Distribution of trip distance at 12:00 hours')
# plt.savefig('tip_amount_distribution.png')
plt.show()