Summary: the distribution of y is skewed, so it is better to apply np.log to it before applying machine learning algorithms.
Plot training set features
1 2 3
# Load the raw training data; the file handle is closed once parsing is done.
with open('train_final.csv') as csv_file:
    train_raw = pd.read_csv(csv_file)

# Discard the 'Unnamed: 0' column right after loading
# (presumably a saved index column — confirm against the CSV).
train_raw = train_raw.drop(columns='Unnamed: 0')
1 2 3 4 5 6 7 8 9 10 11 12
def refine_dataset(train_raw):
    """Return a cleaned copy of ``train_raw`` with time-derived columns added.

    The input frame is not modified. The returned frame:

    * no longer has the columns ``Crowd``, ``AdAccountId``, ``CommodityId``,
      ``CommodityType``, ``AdIndustryId``, ``AdSize`` and ``Date_p24h``;
    * gains ``Date`` (``Date_create`` parsed as epoch seconds, shifted by
      +8 hours), ``hour`` and ``dayofweek``;
    * has missing values replaced by 0 (the split columns appended afterwards
      are not filled);
    * gains one extra column per comma-separated field of ``Time``.
    """
    unused_columns = [
        'Crowd',
        'AdAccountId',
        'CommodityId',
        'CommodityType',
        'AdIndustryId',
        'AdSize',
        'Date_p24h',
    ]
    refined = train_raw.drop(columns=unused_columns)

    # Add 8 hours to move the timestamps to GMT+8 (Beijing time).
    local_time = pd.to_datetime(refined['Date_create'], unit='s') + pd.DateOffset(hours=8)
    refined['Date'] = local_time
    refined['hour'] = local_time.dt.hour
    refined['dayofweek'] = local_time.dt.dayofweek

    refined = refined.fillna(0)
    time_fields = refined['Time'].str.split(',', expand=True)
    return pd.concat([refined, time_fields], axis=1)


df_train = refine_dataset(train_raw)
1 2 3 4 5 6 7 8 9 10
def plot_ad_edits(df_train):
    """Plot the distribution of how many rows each ad ``Id`` has.

    The number of rows sharing an ``Id`` in ``df_train`` is treated as that
    ad's number of edits. The plot is drawn on a new 14x10 inch matplotlib
    figure with one x tick per integer edit count; nothing is returned.
    """
    plt.figure(figsize=(14, 10))
    edits_per_ad = df_train['Id'].value_counts().sort_values()
    sns.set(style="dark", palette="muted", color_codes=True)
    # NOTE(review): sns.distplot is deprecated since seaborn 0.11. It is kept
    # here so the rendered figure stays unchanged; consider sns.histplot.
    sns.distplot(edits_per_ad.values)
    plt.title('distribution of number of ad edits')
    plt.xlabel('number of edits')
    plt.ylabel('percentage')
    # Vectorised min/max instead of iterating the Series in Python.
    plt.xticks(np.arange(edits_per_ad.min(), edits_per_ad.max() + 1, 1.0))
1
# Draw the ad-edit distribution for the refined training frame.
plot_ad_edits(df_train)
Summary: most ads were never edited during the project period.
1 2 3 4 5 6
def plot_exporate_vs_day(df_train):
    """Draw a bar chart of the mean ``exporate`` for each day of the week.

    Rows of ``df_train`` are grouped by ``dayofweek`` and each group is
    labelled with the matching ``calendar.day_name`` entry. The chart is
    drawn on a new 14x7 inch matplotlib figure; nothing is returned.
    """
    daily_mean = df_train.groupby('dayofweek')['exporate'].mean().reset_index()

    # Translate the numeric weekday into its name for the x axis.
    day_numbers = daily_mean['dayofweek'].astype(int).values
    daily_mean['weekdayname'] = [calendar.day_name[day] for day in day_numbers]

    plt.figure(figsize=(14, 7))
    sns.barplot(data=daily_mean, x='weekdayname', y='exporate')
    plt.title('Average exporate for each day of the week')
1
# Draw the per-weekday average exporate for the refined training frame.
plot_exporate_vs_day(df_train)
1 2 3 4 5
def plot_hourly_ad_create(df_train):
    """Draw a bar chart of how many ``Id`` values fall in each ``hour``.

    Rows of ``df_train`` are grouped by ``hour`` and the non-null ``Id``
    entries of each group are counted. The chart is drawn on a new
    14x7 inch matplotlib figure; nothing is returned.
    """
    ads_per_hour = df_train.groupby('hour')['Id'].count()
    hourly_counts = ads_per_hour.reset_index(name='AdCounts')

    plt.figure(figsize=(14, 7))
    sns.barplot(data=hourly_counts, x='hour', y='AdCounts')
    plt.title('Total ads created per hour')
def plot_expohours(df_train):
    """Draw a violin plot of the hours ads are displayed on each weekday.

    The columns at positions 8-15 of ``df_train`` are cast to int, and the
    ones labelled 0..6 among them are read as Monday..Sunday. For every value
    the number of set bits is counted and halved to give hours (presumably
    each value is a bitmask of 48 half-hour slots — confirm against the data).
    The plot is drawn on a new 14x7 inch matplotlib figure; nothing is
    returned.
    """
    # Cast once up front; this does not depend on the weekday being processed.
    schedule = df_train.iloc[:, 8:16].astype(int)

    # Counting '1' characters in the binary form gives the number of set bits.
    hours_per_day = pd.DataFrame({
        calendar.day_name[day]: schedule[day].map(lambda mask: bin(int(mask)).count('1')) / 2
        for day in range(7)
    })

    # Melt once and reuse the long-form frame for both axes.
    long_form = pd.melt(hours_per_day)

    plt.figure(figsize=(14, 7))
    # Keyword arguments are required: recent seaborn releases accept only
    # ``data`` positionally, so positional x/y vectors raise a TypeError.
    sns.violinplot(x=long_form['variable'], y=long_form['value'])
    plt.title('Number of hours ads display per weekday')
    plt.xlabel('')
    plt.ylabel('Total hours')