The poor rabbit chased by Python and Anaconda :p

0%

Tencent 2019 Data Science Competition[Ads Exposure Rate] Extra: Data Visualization

Just for some extra fun, let’s do some plots to explore the ads dataset a bit.

1
2
3
4
5
6
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
%matplotlib inline

Some fun visualizations of the ads dataset

exposure rate distribution for the training set

1
2
# Load the 24h exposure-rate targets; the CSV has no header row and its
# first column is a row index, which we discard.
df_Y = pd.read_csv('Y_train.csv', header=None).drop(columns=0)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
def plot_exporate(df_Y):
    """Plot the raw and log1p-transformed 24h exposure-rate distributions side by side."""
    sns.set(style="dark", palette="muted", color_codes=True)
    fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(14, 7))
    # Left panel: raw exposure rate.
    sns.distplot(df_Y, ax=ax_raw)
    # Right panel: log1p-transformed exposure rate.
    sns.distplot(np.log1p(df_Y), ax=ax_log)
    ax_raw.title.set_text('24 h exposure rate distribution')
    ax_raw.set_ylim(0, 0.0002)
    ax_raw.set_xlabel('exposure rate')
    ax_log.title.set_text('log 24 h exposure rate distribution')
    ax_log.set_ylim(0, 2)
    ax_log.set_xlabel('log exposure rate')
    plt.tight_layout()

plot_exporate(df_Y)

png

Summary: the distribution of y is skewed, so it is better to apply np.log before applying machine learning algorithms.

plot training set features

1
2
3
# Load the engineered training features and discard the CSV's saved
# index column ('Unnamed: 0').
train_raw = pd.read_csv('train_final.csv').drop(columns='Unnamed: 0')
1
2
3
4
5
6
7
8
9
10
11
12
def refine_dataset(train_raw):
    """Derive plotting features from the raw training frame.

    Drops identifier/metadata columns, converts the Unix creation
    timestamp to Beijing local time (GMT+8) with derived hour and
    day-of-week fields, and expands the comma-separated 'Time' string
    into one column per token.
    """
    unused_cols = ['Crowd', 'AdAccountId', 'CommodityId', 'CommodityType',
                   'AdIndustryId', 'AdSize', 'Date_p24h']
    base = train_raw.drop(columns=unused_cols)
    # Shift UTC epoch seconds by 8 hours to get GMT+8 Beijing time.
    base['Date'] = pd.to_datetime(base['Date_create'], unit='s') + pd.DateOffset(hours=8)
    base['hour'] = base['Date'].dt.hour
    base['dayofweek'] = base['Date'].dt.dayofweek
    base = base.fillna(0)
    # Append one column per comma-separated token of the 'Time' field.
    return pd.concat([base, base['Time'].str.split(',', expand=True)], axis=1)
df_train = refine_dataset(train_raw)
1
2
3
4
5
6
7
8
9
10
def plot_ad_edits(df_train):
    """Histogram of how many rows (edits) each ad id has in the training set."""
    plt.figure(figsize=(14, 10))
    # One row per edit, so the per-Id row count is the edit count.
    ad_edits = df_train['Id'].value_counts().sort_values()
    sns.set(style="dark", palette="muted", color_codes=True)
    sns.distplot(ad_edits.values)
    # Removed unused local `f = plt.gcf()` — the figure handle was never used.
    plt.title('distribution of number of ad edits')
    plt.xlabel('number of edits')
    plt.ylabel('percentage')
    # One x tick per integer edit count.
    plt.xticks(np.arange(min(ad_edits), max(ad_edits) + 1, 1.0))

plot_ad_edits(df_train)

png

Summary: most ads were never edited during the project period.

1
2
3
4
5
6
def plot_exporate_vs_day(df_train):
    """Bar chart of the mean exposure rate for each day of the week."""
    daily = df_train.groupby('dayofweek')['exporate'].mean().reset_index()
    # Map 0..6 to human-readable weekday names for the x axis.
    daily['weekdayname'] = daily['dayofweek'].astype(int).map(lambda d: calendar.day_name[d])
    plt.figure(figsize=(14, 7))
    sns.barplot(x='weekdayname', y='exporate', data=daily)
    plt.title('Average exporate for each day of the week')

plot_exporate_vs_day(df_train)

png

1
2
3
4
5
def plot_hourly_ad_create(df_train):
    """Bar chart of how many ads were created during each hour of the day."""
    hourly_counts = (
        df_train.groupby('hour')['Id']
        .count()
        .reset_index()
        .rename(columns={'Id': 'AdCounts'})
    )
    plt.figure(figsize=(14, 7))
    sns.barplot(x='hour', y='AdCounts', data=hourly_counts)
    plt.title('Total ads created per hour')

plot_hourly_ad_create(df_train)

png

1
2
3
4
5
6
7
def plot_meanbid(df_train):
    """Distribution of each ad's mean bid across all of its edits."""
    mean_bids = (
        df_train.groupby('Id')['Bid']
        .mean()
        .reset_index()
        .rename(columns={'Bid': 'Mean Bid'})
    )
    plt.figure(figsize=(14, 7))
    sns.distplot(mean_bids['Mean Bid'])
    plt.ylim(0, 0.001)
    plt.title('Mean Bid distribution')

plot_meanbid(df_train)

png

1
2
3
4
5
6
7
8
9
10
11
12
13
def plot_expohours(df_train):
    """Violin plot of the total display hours per weekday.

    Each per-day value is a 48-bit mask (one bit per half-hour slot),
    so the popcount of the mask divided by two is the number of hours
    the ad displays that day.
    """
    # Hoisted out of the loop: the original recomputed this applymap
    # over all sliced columns on every one of the 7 iterations.
    # NOTE(review): columns 8:16 are assumed to be the per-day columns
    # produced by refine_dataset's Time split — confirm if layout changes.
    bitstrings = df_train.iloc[:, 8:16].astype(int).applymap(
        lambda x: f'{x:48b}'.replace(' ', '0'))
    hours = pd.DataFrame(index=df_train.index)
    for i in range(0, 7):
        # Popcount / 2: each set bit marks one half-hour display slot.
        hours[calendar.day_name[i]] = bitstrings[i].str.count('1') / 2
    plt.figure(figsize=(14, 7))
    melted = pd.melt(hours)  # melt once; the original melted twice
    sns.violinplot(melted['variable'], melted['value'])
    plt.title('Number of hours ads display per weekday')
    plt.xlabel('')
    plt.ylabel('Total hours')

plot_expohours(df_train)

png