Just for some extra fun, let's make a few plots to explore the ads dataset a bit.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
%matplotlib inline

Some fun visualizations of the ads dataset

exposure rate distribution for the training set

with open('Y_train.csv') as f:
    df_Y = pd.read_csv(f, header=None).drop(columns=0)

def plot_exporate(df_Y):
    """Draw the exposure-rate distribution on a raw and a log1p scale.

    Builds a 1x2 figure: the left panel shows ``df_Y`` as given and the
    right panel shows ``np.log1p(df_Y)``. Nothing is returned; the figure is
    left as the current matplotlib figure.

    Args:
        df_Y: numeric data accepted by both ``sns.distplot`` and ``np.log1p``.
    """
    sns.set(style="dark", palette="muted", color_codes=True)
    _, (raw_ax, log_ax) = plt.subplots(1, 2, figsize=(14, 7))

    # One entry per panel: (axis, data, title, upper y-limit, x-axis label).
    # The y-limits are fixed so the bulk of each density stays readable.
    panels = (
        (raw_ax, df_Y,
         '24 h exposure rate distribution', 0.0002, 'exposure rate'),
        (log_ax, np.log1p(df_Y),
         'log 24 h exposure rate distribution', 2, 'log exposure rate'),
    )
    for ax, values, title, y_top, x_label in panels:
        sns.distplot(values, ax=ax)
        ax.title.set_text(title)
        ax.set_ylim(0, y_top)
        ax.set_xlabel(x_label)

    plt.tight_layout()

# Render the raw / log1p exposure-rate figure for df_Y.
plot_exporate(df_Y)

Summary: the distribution of y is skewed, so it is better to apply a log transform (np.log, or np.log1p as used in the plot above) before applying machine learning algorithms.

Plot training set features

# Load the training features and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved — confirm).
with open('train_final.csv') as f:
    train_raw = pd.read_csv(f).drop(columns='Unnamed: 0')

def refine_dataset(train_raw):
    """Return a plotting-ready copy of the raw training frame.

    The input frame is left untouched. On a copy of it this function:

    * drops a fixed list of columns that the plots do not use,
    * adds ``Date`` (``Date_create`` read as epoch seconds, shifted by
      +8 hours) together with ``hour`` and ``dayofweek`` derived from it,
    * replaces missing values with 0,
    * splits the comma-separated ``Time`` strings and appends the pieces as
      extra columns labelled ``0, 1, 2, ...``.

    Args:
        train_raw: DataFrame containing at least the dropped columns,
            ``Date_create`` and ``Time``.

    Returns:
        A new DataFrame with the columns described above.
    """
    unused_columns = [
        'Crowd', 'AdAccountId', 'CommodityId', 'CommodityType',
        'AdIndustryId', 'AdSize', 'Date_p24h',
    ]
    frame = train_raw.copy().drop(columns=unused_columns)

    # Shift by 8 hours to obtain GMT+8 (Beijing) wall-clock time.
    created_at = (
        pd.to_datetime(frame['Date_create'], unit='s') + pd.DateOffset(hours=8)
    )
    frame['Date'] = created_at
    frame['hour'] = created_at.dt.hour
    frame['dayofweek'] = created_at.dt.dayofweek

    # Fill gaps before splitting, exactly as the plots downstream expect.
    frame = frame.fillna(0)
    time_parts = frame['Time'].str.split(',', expand=True)
    return pd.concat([frame, time_parts], axis=1)
# Frame shared by all feature plots below.
df_train = refine_dataset(train_raw)

def plot_ad_edits(df_train):
    """Plot the distribution of how many rows each ``Id`` has.

    The per-``Id`` row count is labelled as the "number of edits" on the
    figure (presumably every row is one edit of an ad — confirm against the
    dataset description). Nothing is returned; the plot is drawn on a new
    14x10 figure.

    Args:
        df_train: DataFrame with an ``Id`` column.
    """
    plt.figure(figsize=(14, 10))
    sns.set(style="dark", palette="muted", color_codes=True)

    rows_per_ad = df_train['Id'].value_counts().sort_values()
    sns.distplot(rows_per_ad.values)

    plt.title('distribution of number of ad edits')
    plt.xlabel('number of edits')
    plt.ylabel('percentage')

    # One tick per integer count between the smallest and largest value.
    fewest, most = rows_per_ad.min(), rows_per_ad.max()
    plt.xticks(np.arange(fewest, most + 1, 1.0))

plot_ad_edits(df_train)

Summary: most ads were never edited during the project period.

def plot_exporate_vs_day(df_train):
    """Bar-plot the mean ``exporate`` for each ``dayofweek`` value.

    Args:
        df_train: DataFrame with ``dayofweek`` (0 = Monday, as produced by
            pandas ``dt.dayofweek``) and ``exporate`` columns.
    """
    daily_mean = df_train.groupby('dayofweek')['exporate'].mean()
    plotdata = daily_mean.reset_index()

    # Translate the numeric weekday into its calendar name for the x axis.
    day_numbers = plotdata['dayofweek'].astype(int)
    plotdata['weekdayname'] = day_numbers.map(lambda day: calendar.day_name[day])

    plt.figure(figsize=(14, 7))
    sns.barplot(x='weekdayname', y='exporate', data=plotdata)
    plt.title('Average exporate for each day of the week')

plot_exporate_vs_day(df_train)

def plot_hourly_ad_create(df_train):
    """Bar-plot the number of rows (non-null ``Id``) per ``hour`` value.

    Args:
        df_train: DataFrame with ``hour`` and ``Id`` columns.
    """
    ads_per_hour = df_train.groupby('hour')['Id'].count()
    plotdata = ads_per_hour.rename('AdCounts').reset_index()

    plt.figure(figsize=(14, 7))
    sns.barplot(x='hour', y='AdCounts', data=plotdata)
    plt.title('Total ads created per hour')

plot_hourly_ad_create(df_train)

def plot_meanbid(df_train):
    """Plot the distribution of the mean ``Bid`` per ``Id``.

    Args:
        df_train: DataFrame with ``Id`` and ``Bid`` columns.
    """
    mean_bid_per_ad = df_train.groupby('Id')['Bid'].mean()
    plotdata = mean_bid_per_ad.rename('Mean Bid').reset_index()

    plt.figure(figsize=(14, 7))
    sns.distplot(plotdata['Mean Bid'])
    # Fixed upper limit on the density axis keeps the low-density body visible.
    plt.ylim(0, 0.001)
    plt.title('Mean Bid distribution')

plot_meanbid(df_train)

def plot_expohours(df_train):
    """Violin-plot the number of display hours scheduled per weekday.

    Reads the schedule columns labelled ``0`` .. ``6`` from the positional
    column window ``8:16`` of ``df_train``. Every value is interpreted as an
    integer bitmask; the number of set bits divided by two is plotted as
    hours (presumably one bit per half-hour slot, 48 per day — TODO confirm
    against the dataset description).

    Args:
        df_train: DataFrame whose positional columns 8..15 include columns
            labelled ``0`` .. ``6`` holding integer-convertible masks.
    """
    # Convert once, outside the loop. int64 is spelled out because the masks
    # are formatted 48 bits wide and plain ``int`` can be 32-bit on some
    # platforms.
    slot_masks = df_train.iloc[:, 8:16].astype('int64')

    # Build the weekday frame directly instead of slicing by the English
    # labels 'Monday':'Sunday'; calendar.day_name follows the active locale.
    hours = pd.DataFrame(index=df_train.index)
    for i in range(7):
        set_bits = slot_masks[i].map(lambda mask: bin(mask).count('1'))
        hours[calendar.day_name[i]] = set_bits / 2

    long_form = pd.melt(hours)

    plt.figure(figsize=(14, 7))
    # x/y must be keywords: seaborn >= 0.12 accepts only ``data`` positionally.
    sns.violinplot(x=long_form['variable'], y=long_form['value'])
    plt.title('Number of hours ads display per weekday')
    plt.xlabel('')
    plt.ylabel('Total hours')

# Render the per-weekday display-hours violin plot for df_train.
plot_expohours(df_train)