def data_clean(df, min_price=140):
    """Clean the raw rental-listing frame and derive a few helper columns.

    Reads the ``price``, ``address``, ``bond``, ``agent_brand`` and
    ``agent_name`` columns of ``df``.

    Steps:
      * drop rows whose ``price`` is missing or is not a purely numeric string;
      * cast ``price`` to float and keep only rows with ``price > min_price``;
      * ``suburb``: text after the last comma of ``address``, stripped;
      * ``bond``: text after the ``$`` sign, thousands separators removed,
        cast to float;
      * ``agent_suburb`` / ``agent_brand``: the parts after / before the first
        ``-`` of the original ``agent_brand`` (only the brand is stripped);
      * ``agent_name``: when the text before the first ``:`` is "agent"
        (case-insensitive), the text after it is used instead.

    Args:
        df: DataFrame holding the columns listed above as strings.
        min_price: Rows with a price at or below this value are treated as
            unreasonable and dropped. Defaults to 140.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # Price must be present and consist of digits only; drop the record if not.
    priced = df[df['price'].notna()]
    df = priced[priced['price'].str.isdigit()].reset_index(drop=True)

    df['price'] = df['price'].astype(float)

    # Some prices are not reasonable; drop them.
    df = df[df['price'] > min_price].reset_index(drop=True)

    # `n` is passed by keyword: newer pandas no longer accepts it positionally.
    df['suburb'] = (
        df['address'].str.rsplit(',', n=1, expand=True)[1].str.strip()
    )
    df['bond'] = (
        df['bond']
        .str.split('$', expand=True)[1]
        .str.replace(',', '', regex=False)
        .astype(float)
    )

    # Split once and reuse both halves.
    brand_parts = df['agent_brand'].str.split('-', expand=True)
    df['agent_suburb'] = brand_parts[1]
    df['agent_brand'] = brand_parts[0].str.strip()

    # Values such as "Agent: Jane Doe" keep only the part after the colon.
    # Series.mask builds a new Series, which avoids assigning into a slice
    # (the source of the SettingWithCopyWarning in the earlier version).
    name_parts = df['agent_name'].str.split(':', expand=True)
    agent_name = name_parts[0]
    if 1 in name_parts.columns:  # column 1 is absent when no name has a colon
        has_label = agent_name.str.lower() == 'agent'
        agent_name = agent_name.mask(has_label, name_parts[1])
    df['agent_name'] = agent_name

    return df


df_clean = data_clean(df)
/Users/yaoyao/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:17: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
1
# Preview the first rows of the cleaned frame.
df_clean.head()
property_type
price
bond
address
feature_bedroom
feature_bathroom
feature_parking
agent_brand
agent_name
available_date
property_details
suburb
agent_suburb
0
Apartment
450.0
1956.0
102/40 Bettina Street, Clayton
2
1
1
Harcourts
Steven Ker
Open Sat 16 Nov 11:45am View allAvailable 25 N...
First floor apartment with balcony consists of...
Clayton
Judd White
1
Townhouse
560.0
2433.0
15 Gentle Street, Clayton
3
2
2
Buxton
Talia Karagaslis
Available 20 Dec 2019
This beautiful three bedroom townhouse is loca...
Clayton
Oakleigh
2
Townhouse
590.0
2564.0
4 Main Road, Clayton
3
2
2
Ray White
Anthony Lu
Open today 4:45pm View allAvailable 22 Nov 2019
Only minutes to Clayton train station, bus sto...
Clayton
Oakleigh
3
Apartment
340.0
1477.0
7/126 Wellington Road, Clayton
2
1
1
Ray White
Anthony Lu
Open today 12:00pm View allAvailable 25 Nov 2019
Located within walking distance to Monash Univ...
Clayton
Oakleigh
4
Unit
475.0
2064.0
Address available on request, Clayton
3
1
2
Ray White
Johnson Tan
Available 01 Dec 2019
* Three generous-sized Bedrooms with Built-In ...
Clayton
Oakleigh
Apply basic text cleaning to the property details.
def remove_punct(text):
    """Strip punctuation, digits, tabs, slashes and bullet marks from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character found in ``string.punctuation`` removed,
        followed by removal of any run of digits, tab characters, ``/`` or
        ``•``.
    """
    without_punct = "".join(
        char for char in text if char not in string.punctuation
    )
    # The bullet character is not part of string.punctuation, so it is
    # removed here together with digits and tabs.
    return re.sub(r'[0-9\t/•]+', '', without_punct)
def remove_stopword(text):
    """Return the tokens of ``text`` that are not English stop words.

    Args:
        text: An iterable of word tokens.

    Returns:
        A list of the tokens that do not appear in NLTK's English stop-word
        list, in their original order.
    """
    # Fetch the stop-word list once instead of once per token, and keep it
    # in a set so each membership test is a hash lookup.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in stop_words]
def apply_lemmatization(text):
    """Lemmatize each token of ``text`` with NLTK's ``WordNetLemmatizer``.

    Args:
        text: An iterable of word tokens.

    Returns:
        A list with the result of ``lemmatize`` for every token, in order.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]
def build_input(df):
    """Build one space-separated string from tokenised lines.

    Args:
        df: An iterable of lines, each line being an iterable of word tokens.

    Returns:
        A single string containing every token that is not in the exclusion
        list, separated by single spaces. Returns ``""`` when nothing is left.
    """
    # Common words that should not appear in the word cloud.
    noshow = {
        'inspection', 'time', 'open', 'book', 'apartment', 'house', 'studio',
        'email', 'agent', 'cancellation', 'update', 'change', 'informed',
        'property', 'inspect', 'townhouse',
    }
    # Join across line boundaries with a space as well; concatenating the
    # per-line strings directly would fuse the last word of one line with the
    # first word of the next.
    return " ".join(
        word for line in df for word in line if word not in noshow
    )
def build_input_wf(df):
    """Flatten tokenised lines into one word list for frequency counting.

    Args:
        df: An iterable of lines, each line being an iterable of word tokens.

    Returns:
        A flat list of every token that is not in the exclusion list, in
        their original order.
    """
    # Common words that should not appear in the word cloud.
    noshow = {
        'inspection', 'time', 'open', 'book', 'apartment', 'house', 'studio',
        'email', 'agent', 'cancellation', 'update', 'change', 'informed',
        'property', 'inspect', 'townhouse',
    }
    return [word for line in df for word in line if word not in noshow]