```python
import re

import numpy as np
import pandas as pd


def crowd_refine(train_raw):
    crowd = train_raw['Crowd']

    # Get ads that have multiple crowd attributes (separated by '|').
    multifield = crowd[crowd.str.contains(r'\|')]

    # Find how many unique attribute names exist in the Crowd column.
    matches = []
    for line in multifield:
        matches.extend(re.findall(r'\w+:', line))
    crowd_attr = [m[:-1] for m in set(matches)]  # strip the trailing ':'

    # Expand Crowd into one column per attribute.
    rows_list = []
    for row in crowd:
        dict1 = {}
        for attr in crowd_attr:
            # Match 'attr:' followed by a comma-separated list of numbers.
            match = re.search(attr + r':([0-9]+(?:,[0-9]+)*)', row)
            dict1[attr] = match.group(1) if match else np.nan
        rows_list.append(dict1)

    # age, area, etc. are meaningless or too sparse; keep only
    # education/status/connecttype (errors='ignore' in case an
    # attribute never appears in the data).
    df_crowd = pd.DataFrame(rows_list).drop(
        columns=['age', 'area', 'consuptionAbility', 'device',
                 'gender', 'work', 'behavior', 'os'],
        errors='ignore')

    # One special case remains: Crowd == 'all' is not handled here;
    # we deal with it later, during one-hot encoding.
    return df_crowd
```
```python
df_crowd = crowd_refine(df_combine)
```
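To make the parsing concrete, here is a toy run on the Crowd format the regexes expect: `attribute:value` pairs with comma-separated values, joined by `|`. The sample strings below are made up for illustration, and the column order of the result may vary:

```python
# Hypothetical sample mirroring the Crowd format.
toy = pd.DataFrame({'Crowd': [
    'education:1,2|status:3|connecttype:1',
    'education:5',
    'all',  # special case: no targeting restriction, handled later
]})
print(crowd_refine(toy))
#   education status connecttype
# 0       1,2      3           1
# 1         5    NaN         NaN
# 2       NaN    NaN         NaN
```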
Use one-hot encoding and hashing to encode each of the features.
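The code below only uses one-hot encoding, but for reference, here is a minimal sketch of the hashing alternative using scikit-learn's `FeatureHasher` (the token format and `n_features=32` are arbitrary choices for illustration):

```python
from sklearn.feature_extraction import FeatureHasher

# Hash 'attr:value' tokens into a fixed-width sparse matrix; collisions are
# possible, but the dimensionality no longer grows with the number of categories.
hasher = FeatureHasher(n_features=32, input_type='string')
tokens = [['education:1', 'status:3'], ['connecttype:1']]  # hypothetical rows
X_hashed = hasher.transform(tokens)  # scipy sparse matrix, shape (2, 32)
```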
Why does the age feature have sooooo many categories??? 998 values… so scary. OK, I checked the doc: age is a random number…
Only education/status/connecttype have enough valid values.
```python
def onehot_encode_features(df, df_combine):
    attrlist = df.columns
    df_crowd_expand = pd.DataFrame()
    for attr in attrlist:
        # Split the comma-separated value lists into indicator columns.
        p = df[attr].str.get_dummies(sep=',')
        p.columns = [attr + c for c in p.columns]  # prefix with attribute name
        # Deal with Crowd == 'all': those rows are NaN in df, so their dummies
        # are all zero; mark them in the first indicator column instead.
        # (Assumes df_combine has a default RangeIndex, since iloc is positional.)
        all_list = df_combine[df_combine['Crowd'] == 'all'].index.to_list()
        p.iloc[all_list, 0] = 1.0
        df_crowd_expand = pd.concat([df_crowd_expand, p], axis=1)
    return df_crowd_expand
```
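`str.get_dummies(sep=",")` is what splits the comma-separated value lists into indicator columns. A quick illustration on made-up values also shows why the `all` rows need patching:

```python
s = pd.Series(['1,2', '3', None])   # hypothetical education values
print(s.str.get_dummies(sep=','))
#    1  2  3
# 0  1  1  0
# 1  0  0  1
# 2  0  0  0   <- NaN rows become all-zero, hence the fix for Crowd == 'all'
```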
```python
def buildbase(df_combine):
    base = df_combine.drop(columns=['ExpoTime', 'Crowd'])

    # Bid: max bid is 10000 in the train set but 20000 in the test set,
    # so large values should be kept. Convert to log scale for a more
    # even distribution.
    base['Bid'] = np.log1p(base['Bid'])

    # Fix AdIndustryId: some ads list several ids separated by commas;
    # keep only the first one.
    has_comma = base['AdIndustryId'].astype(str).str.contains(',')
    x = base.loc[has_comma, 'AdIndustryId'].map(lambda s: s.split(',')[0])
    y = base.loc[~has_comma, 'AdIndustryId']
    base['AdIndustryId'] = pd.concat([y, x]).sort_index()

    # Build weekday/hour features from the ad creation date (Unix timestamp).
    created = pd.to_datetime(base['Date'], unit='s')
    base['hour'] = created.dt.hour
    base['dayofweek'] = created.dt.dayofweek

    # Unify attributes: drop columns not used as features, cast the
    # categoricals to str so get_dummies picks them all up.
    base = base.drop(columns=['AdId', 'Date', 'CommodityId', 'AdAccountId'])
    strlist = ['AdSize', 'AdIndustryId', 'CommodityType', 'hour', 'dayofweek']
    base[strlist] = base[strlist].astype(str)

    # Add all features together via one-hot encoding.
    base = pd.get_dummies(base)
    return base
```
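Since Date is a Unix timestamp, the hour/dayofweek extraction works like this on a toy value (the timestamp below is made up):

```python
ts = pd.Series([1551398400])            # hypothetical timestamp: 2019-03-01 00:00 UTC
dt = pd.to_datetime(ts, unit='s')
print(dt.dt.hour.iloc[0], dt.dt.dayofweek.iloc[0])  # -> 0 4 (Friday, Monday=0)
```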
```python
def build_train_test(base, df_crowd_expand, expotime, Y_train):
    base2 = pd.concat([df_crowd_expand, expotime], axis=1)
    final = pd.concat([base, base2], axis=1)
    del base
    del base2

    # Split back into train/test using the 'trainset' flag.
    X_train = final.loc[final['trainset'] == 1].drop(columns='trainset')
    X_test = final.loc[final['trainset'] == 0].drop(columns='trainset')

    # Fill Y_train NaNs with zero.
    Y_train = Y_train.fillna(0)
    return X_train, Y_train, X_test
```
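Putting the pieces together, the wiring looks roughly like this (`expotime` and `Y_train` are assumed to come from earlier cells not shown here):

```python
# Assumed pipeline wiring; df_combine, expotime and Y_train are built earlier.
df_crowd = crowd_refine(df_combine)
df_crowd_expand = onehot_encode_features(df_crowd, df_combine)
base = buildbase(df_combine)
X_train, Y_train, X_test = build_train_test(base, df_crowd_expand, expotime, Y_train)
```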