```python
import re

import numpy as np
import pandas as pd


def crowd_refine(train_raw):
    crowd = train_raw['Crowd']

    # Get ads that have multiple crowd attributes (separated by '|').
    multifield = crowd[crowd.str.contains(r'\|')]

    # Find how many unique attribute names exist in the Crowd column.
    matches = []
    for line in multifield:
        matches.extend(re.findall(r'\w+:', line))
    crowd_attr = [m[:-1] for m in set(matches)]  # strip the trailing ':'

    # Expand Crowd into one column per attribute.
    rows_list = []
    for row in crowd:
        dict1 = {}
        for attr in crowd_attr:
            # Match 'attr:' followed by a comma-separated list of numbers.
            match = re.search(attr + r':([0-9]+(?:,[0-9]+)*)', row)
            dict1[attr] = match.group(1) if match else np.nan
        rows_list.append(dict1)

    # age, area, etc. are meaningless or too sparse; keep only
    # education/status/connecttype (errors='ignore' in case an
    # attribute never appears in the data).
    df_crowd = pd.DataFrame(rows_list).drop(
        columns=['age', 'area', 'consuptionAbility', 'device',
                 'gender', 'work', 'behavior', 'os'],
        errors='ignore')

    # One special case remains: Crowd == 'all' is not handled here;
    # we deal with it later, during one-hot encoding.
    return df_crowd
```
```python
df_crowd = crowd_refine(df_combine)
```
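To make the parsing concrete, here is a toy run on the Crowd format the regexes expect: `attribute:value` pairs with comma-separated values, joined by `|`. The sample strings below are made up for illustration, and the column order of the result may vary:

```python
# Hypothetical sample mirroring the Crowd format.
toy = pd.DataFrame({'Crowd': [
    'education:1,2|status:3|connecttype:1',
    'education:5',
    'all',  # special case: no targeting restriction, handled later
]})
print(crowd_refine(toy))
#   education status connecttype
# 0       1,2      3           1
# 1         5    NaN         NaN
# 2       NaN    NaN         NaN
```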
Use one-hot encoding and hashing to encode each of the features.
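The code below only uses one-hot encoding, but for reference, here is a minimal sketch of the hashing alternative using scikit-learn's `FeatureHasher` (the token format and `n_features=32` are arbitrary choices for illustration):

```python
from sklearn.feature_extraction import FeatureHasher

# Hash 'attr:value' tokens into a fixed-width sparse matrix; collisions are
# possible, but the dimensionality no longer grows with the number of categories.
hasher = FeatureHasher(n_features=32, input_type='string')
tokens = [['education:1', 'status:3'], ['connecttype:1']]  # hypothetical rows
X_hashed = hasher.transform(tokens)  # scipy sparse matrix, shape (2, 32)
```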
Why does the age feature have sooooo many categories??? 998 values… so scary. OK, I checked the doc: age is a random number…
Only education/status/connecttype have enough valid values.
```python
def onehot_encode_features(df, df_combine):
    attrlist = df.columns
    df_crowd_expand = pd.DataFrame()
    for attr in attrlist:
        # Split the comma-separated value lists into indicator columns.
        p = df[attr].str.get_dummies(sep=',')
        p.columns = [attr + c for c in p.columns]  # prefix with attribute name
        # Deal with Crowd == 'all': those rows are NaN in df, so their dummies
        # are all zero; mark them in the first indicator column instead.
        # (Assumes df_combine has a default RangeIndex, since iloc is positional.)
        all_list = df_combine[df_combine['Crowd'] == 'all'].index.to_list()
        p.iloc[all_list, 0] = 1.0
        df_crowd_expand = pd.concat([df_crowd_expand, p], axis=1)
    return df_crowd_expand
```
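`str.get_dummies(sep=",")` is what splits the comma-separated value lists into indicator columns. A quick illustration on made-up values also shows why the `all` rows need patching:

```python
s = pd.Series(['1,2', '3', None])   # hypothetical education values
print(s.str.get_dummies(sep=','))
#    1  2  3
# 0  1  1  0
# 1  0  0  1
# 2  0  0  0   <- NaN rows become all-zero, hence the fix for Crowd == 'all'
```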
```python
def buildbase(df_combine):
    base = df_combine.drop(columns=['ExpoTime', 'Crowd'])

    # Bid: max bid is 10000 in the train set but 20000 in the test set,
    # so large values should be kept. Convert to log scale for a more
    # even distribution.
    base['Bid'] = np.log1p(base['Bid'])

    # Fix AdIndustryId: some ads list several ids separated by commas;
    # keep only the first one.
    has_comma = base['AdIndustryId'].astype(str).str.contains(',')
    x = base.loc[has_comma, 'AdIndustryId'].map(lambda s: s.split(',')[0])
    y = base.loc[~has_comma, 'AdIndustryId']
    base['AdIndustryId'] = pd.concat([y, x]).sort_index()

    # Build weekday/hour features from the ad creation date (Unix timestamp).
    created = pd.to_datetime(base['Date'], unit='s')
    base['hour'] = created.dt.hour
    base['dayofweek'] = created.dt.dayofweek

    # Unify attributes: drop columns not used as features, cast the
    # categoricals to str so get_dummies picks them all up.
    base = base.drop(columns=['AdId', 'Date', 'CommodityId', 'AdAccountId'])
    strlist = ['AdSize', 'AdIndustryId', 'CommodityType', 'hour', 'dayofweek']
    base[strlist] = base[strlist].astype(str)

    # Add all features together via one-hot encoding.
    base = pd.get_dummies(base)
    return base
```
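Since Date is a Unix timestamp, the hour/dayofweek extraction works like this on a toy value (the timestamp below is made up):

```python
ts = pd.Series([1551398400])            # hypothetical timestamp: 2019-03-01 00:00 UTC
dt = pd.to_datetime(ts, unit='s')
print(dt.dt.hour.iloc[0], dt.dt.dayofweek.iloc[0])  # -> 0 4 (Friday, Monday=0)
```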
```python
def build_train_test(base, df_crowd_expand, expotime, Y_train):
    base2 = pd.concat([df_crowd_expand, expotime], axis=1)
    final = pd.concat([base, base2], axis=1)
    del base
    del base2

    # Split back into train/test using the 'trainset' flag.
    X_train = final.loc[final['trainset'] == 1].drop(columns='trainset')
    X_test = final.loc[final['trainset'] == 0].drop(columns='trainset')

    # Fill Y_train NaNs with zero.
    Y_train = Y_train.fillna(0)
    return X_train, Y_train, X_test
```
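Putting the pieces together, the wiring looks roughly like this (`expotime` and `Y_train` are assumed to come from earlier cells not shown here):

```python
# Assumed pipeline wiring; df_combine, expotime and Y_train are built earlier.
df_crowd = crowd_refine(df_combine)
df_crowd_expand = onehot_encode_features(df_crowd, df_combine)
base = buildbase(df_combine)
X_train, Y_train, X_test = build_train_test(base, df_crowd_expand, expotime, Y_train)
```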