

Tencent 2019 Data Science Competition [Ads Exposure Rate] Part 3

Just another day of data cleaning… this dataset really required lots and lots of cleaning…

Data preparation: continued

In the last notebook we derived the correct label: the next-24-hour exposure rate.

Now we need to finalize the training and test datasets to make them ready as ML input.

import libs

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction import FeatureHasher
import seaborn as sns
%matplotlib inline

load data

with open('train_final.csv') as f:
    train_raw = pd.read_csv(f)
train_raw.drop(columns='Unnamed: 0', inplace=True)
train_raw.head()

Id Date_create Bid Crowd Time AdAccountId CommodityId CommodityType AdIndustryId AdSize Date_p24h exporate
0 32 1543563617 83 area:7572 70368475742208,70368475742208,70368475742208,7... 18752 32534 13 136 40 1543650017 NaN
1 32 1550727091 91 area:7572 70368475742208,70368475742208,70368475742208,7... 18752 32534 13 136 40 1550813491 6.0
2 32 1551110479 90 area:7572 70368475742208,70368475742208,70368475742208,7... 18752 32534 13 136 40 1551196879 3.0
3 32 1551331895 106 area:7572 70368475742208,70368475742208,70368475742208,7... 18752 32534 13 136 40 1551418295 1.0
4 32 1551504704 97 area:7572 70368475742208,70368475742208,70368475742208,7... 18752 32534 13 136 40 1551591104 NaN

combine train data and test data to process them together.

collist_test = ['Id', 'AdId', 'Date', 'AdSize', 'AdIndustryId', 'CommodityType', 'CommodityId',
                'AdAccountId', 'ExpoTime', 'Crowd', 'Bid']
with open('test_sample.dat') as ft:
    df_test = pd.read_csv(ft, sep='\t', names=collist_test)
df_test.head()

Id AdId Date AdSize AdIndustryId CommodityType CommodityId AdAccountId ExpoTime Crowd Bid
0 1 394352 1529648412 34 84 13 29663 26657 281474976645120,281474976645120,28147497664512... age:819,608,988,741,202,837,400,394,942,361,72... 120
1 2 585401 1553076190 40 221 1 -1 6262 281474976579587,281474976579587,28147497657958... age:819,433,479,741,229,347,522,79,753,601|edu... 42
2 3 419408 1553031394 30 122 13 32110 17436 17592185782272,17592185782272,17592185782272,1... all 6
3 4 405326 1553238836 64 136 1 -1 22359 281474976694272,281474976694272,28147497669427... age:333,1|gender:2|area:11505,1874,3790,4566,5... 181
4 5 578942 1541191585 34 12 13 6372 24082 68719214592,68719214592,68719214592,6871921459... age:819,608,988,741,202,837,400,394,942,361,72... 31
def test_train_combine(train_raw, df_test):
    # reformat the train dataset
    Y_train = train_raw['exporate']
    train_raw = train_raw.drop(columns=['Date_p24h', 'exporate'])
    collist = ['Id', 'Date_create', 'AdSize', 'AdIndustryId', 'CommodityType',
               'CommodityId', 'AdAccountId', 'Time', 'Crowd', 'Bid']
    train_raw = train_raw[collist]
    # rename some columns to make them consistent with the test set
    train_raw.rename(columns={'Id': 'AdId', 'Date_create': 'Date', 'Time': 'ExpoTime'}, inplace=True)
    train_raw['trainset'] = 1
    # next for df_test: drop the sample id and flag the rows as test
    df_test.drop(columns='Id', inplace=True)
    df_test['trainset'] = 0
    df_combine = pd.concat([train_raw, df_test]).reset_index(drop=True)
    return df_combine, Y_train
df_combine,Y_train = test_train_combine(train_raw,df_test)
df_combine.loc[df_combine['Crowd'] == 'all'].head()

AdId Date AdSize AdIndustryId CommodityType CommodityId AdAccountId ExpoTime Crowd Bid trainset
48 415 1547789504 1 117 1 -1 17240 17592185782272,17592185782272,17592185782272,1... all 300 1
49 415 1550532431 1 117 1 -1 17240 17592185782272,17592185782272,17592185782272,1... all 300 1
50 415 1551787638 1 117 1 -1 17240 17592185782272,17592185782272,17592185782272,1... all 600 1
71 635 1552887473 40 122 13 22528 7672 17575274348544,17575274348544,17575274348544,1... all 90 1
136 1090 1552369106 30 122 13 26955 1050 69273464340480,69273464340480,69273464340480,6... all 34 1

deal with crowd.

The Crowd attribute represents what kind of user the ad is targeting. We need to split it into multi-column features to make it useful.
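For reference, a raw Crowd value chains attribute:value pairs with '|', and each value holds one or more comma-separated codes. A minimal parsing sketch (the string below is a shortened, illustrative sample in the format seen in the tables above):

# illustrative Crowd string in the dataset's format (shortened sample)
sample = 'age:819,608|gender:2|area:7572,11505'
# split into {attribute: comma-separated codes}
parsed = dict(field.split(':') for field in sample.split('|'))
print(parsed)  # {'age': '819,608', 'gender': '2', 'area': '7572,11505'}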

def crowd_refine(train_raw):
    crowd = train_raw['Crowd']
    # get ads which have multiple crowd attributes
    multifield = crowd[crowd.str.contains(r'\|')]
    # find how many unique attributes exist in the Crowd column
    matches = []
    for line in multifield:
        match = re.findall(r'\w+:', line)
        matches.extend(match)
    crowd_attr = set(matches)
    crowd_attr = list(map(lambda x: x[:-1], crowd_attr))
    # expand Crowd into multiple columns, one per attribute
    rows_list = []
    for row in crowd:
        dict1 = {}
        for attr in crowd_attr:
            attrpat = attr + r':[0-9]+(?:,[0-9]+)*'
            match = re.findall(attrpat, row)
            if len(match) == 0:
                value = np.nan
            else:
                value = re.search(r'[0-9]+(?:,[0-9]+)*', match[0]).group()
            dict1.update({attr: value})
        rows_list.append(dict1)
    # age/area hold meaningless values and the other attributes are too sparse, discard them
    df_crowd = pd.DataFrame(rows_list).drop(columns=['age', 'area', 'consuptionAbility',
                                                     'device', 'gender', 'work', 'behavior', 'os'])
    # one special case of the attribute, 'all', is not handled here; we will deal with it later
    return df_crowd

df_crowd = crowd_refine(df_combine)

use one-hot encoding and hashing to encode each feature.

  • Why does the age feature have soooo many categories? 998 values… so scary. OK, I checked the doc: age is a random number…
  • Only education/status/connecttype have enough valid values (quick check below)
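A quick cardinality check, as a sketch; it assumes a df_crowd that still contains all attributes, i.e. run before crowd_refine drops the sparse columns:

# count distinct codes per crowd attribute
for attr in df_crowd.columns:
    n = df_crowd[attr].dropna().str.split(',').explode().nunique()
    print(attr, n)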
def onehot_encode_features(df, df_combine):
    attrlist = df.columns
    df_crowd_expand = pd.DataFrame()
    for attr in attrlist:
        # one-hot encode the comma-separated codes of this attribute
        p = df[attr].str.get_dummies(sep=",")
        p.columns = list(map(lambda x: attr + x, p.columns))
        # deal with Crowd == 'all': mark those rows in the first dummy column
        all_list = df_combine[df_combine['Crowd'] == 'all'].index.to_list()
        p.iloc[all_list, 0] = 1.0
        df_crowd_expand = pd.concat([df_crowd_expand, p], axis=1)
    return df_crowd_expand
df_crowd_expand = onehot_encode_features(df_crowd,df_combine)
df_crowd_expand.shape
(121130, 28)
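FeatureHasher was imported above but is not needed for the three surviving attributes, since plain one-hot encoding yields only 28 columns. For a high-cardinality attribute it would be the fallback; a minimal sketch (n_features=32 is an arbitrary illustrative choice):

# hash one crowd attribute into a fixed number of columns
hasher = FeatureHasher(n_features=32, input_type='string')
tokens = df_crowd['connecttype'].fillna('').str.split(',')  # list of codes per row
hashed = hasher.transform(tokens)  # scipy sparse matrix of shape (n_rows, 32)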

now deal with the expo time

the ExpoTime field holds seven integers, one per day of the week. Convert each integer into 48 binary digits (one per half-hour slot), where 1 = post and 0 = no post.

  • split the ExpoTime into seven days
  • for each day, convert the int to binary and split the digits into columns (worked example below)
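As a concrete check, take 17592185782272, one of the per-day values visible in the tables above:

# one day value from the ExpoTime column shown earlier
day = 17592185782272
bits = f'{day:048b}'    # zero-padded 48-digit binary string
print(bits)             # 000011111111111111111111111111000000000000000000
print(bits.count('1'))  # 26 -> this ad may post in 26 of the day's 48 slots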
def expotime_refine(train_raw):
    # split the ExpoTime string into seven columns, one per weekday
    expotime = train_raw['ExpoTime'].str.split(',', expand=True)
    # convert each day's integer to a zero-padded 48-digit binary string
    expotime = expotime.astype(int).applymap(lambda x: f'{x:048b}')
    count = 0
    for col in expotime.columns:
        # split the 48-character string into 48 single-digit columns
        a1 = expotime[col].apply(lambda x: pd.Series(list(x)))
        if count == 0:
            base = pd.DataFrame(a1)
        else:
            base = pd.DataFrame(np.hstack((base, a1)))
        count += 1
    return base

expotime = expotime_refine(df_combine)
expotime.shape
(121130, 336)
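That matches expectations: 7 days × 48 binary digits = 336 columns.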
def buildbase(df_combine):
    base = df_combine.drop(columns=['ExpoTime', 'Crowd'])
    # Bid: max bid is 10000 in the train set and 20000 in the test set,
    # so large values should be kept; log scale gives a more even distribution
    base['Bid'] = np.log1p(base['Bid'])
    # fix AdIndustryId: some rows hold several comma-separated ids, keep the first one
    has_comma = base['AdIndustryId'].astype(str).str.contains(',')
    x = base.loc[has_comma, 'AdIndustryId'].map(lambda v: v.split(',')[0])
    y = base.loc[~has_comma, 'AdIndustryId']
    base['AdIndustryId'] = pd.concat([y, x]).sort_index()
    # build weekday/hour attributes from the ad creation date
    base['hour'] = pd.to_datetime(base['Date'], unit='s').dt.hour
    base['dayofweek'] = pd.to_datetime(base['Date'], unit='s').dt.dayofweek
    # drop id-like columns and treat the remaining categoricals as strings
    base = base.drop(columns=['AdId', 'Date', 'CommodityId', 'AdAccountId'])
    strlist = ['AdSize', 'AdIndustryId', 'CommodityType', 'hour', 'dayofweek']
    base[strlist] = base[strlist].astype(str)
    # one-hot encode all categorical features
    base = pd.get_dummies(base)
    return base

def build_train_test(base, df_crowd_expand, expotime, Y_train):
    base2 = pd.concat([df_crowd_expand, expotime], axis=1)
    final = pd.concat([base, base2], axis=1)
    del base
    del base2
    # split back into train/test using the trainset flag
    X_train = final.loc[final['trainset'] == 1].drop(columns='trainset')
    X_test = final.loc[final['trainset'] == 0].drop(columns='trainset')
    # fill Y_train NaNs with zero
    Y_train = Y_train.fillna(0)
    return X_train, Y_train, X_test
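To see what the log transform in buildbase does to the stated bid range (a quick check, using 6 plus the two maxima taken from the data and comments above):

# log1p compresses the wide bid range into a narrow band
print(np.log1p([6, 10000, 20000]))  # ≈ [1.95, 9.21, 9.90]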
base = buildbase(df_combine)
X_train, Y_train, X_test = build_train_test(base,df_crowd_expand,expotime,Y_train)
X_train.shape
Y_train.shape
X_test.shape
(100840, 627)
(100840,)
(20290, 627)
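A quick tally of the 627 columns: 336 come from ExpoTime, 28 from the crowd one-hot encoding, and the remaining 263 from the get_dummies output of buildbase.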
X_train.to_csv('X_train.csv')
Y_train.to_csv('Y_train.csv')
X_test.to_csv('X_test.csv')
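Side note: to_csv writes the row index by default, which is exactly where the 'Unnamed: 0' column we dropped at load time came from. Passing index=False avoids that round-trip artifact, e.g.:

# write without the index so no 'Unnamed: 0' column appears on reload
X_train.to_csv('X_train.csv', index=False)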

now we finally have a ready-to-train dataset. Next, let's finally get to the machine learning part.