
Melbourne Rental Project Part 3: word2vec

Now let’s try word2vec on the property details page and see if we can find something interesting.

load libs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import nltk
import string
import re
%matplotlib inline
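One thing worth noting: the tokenizer, stop-word list and lemmatizer used further down rely on NLTK corpora that aren't bundled with the library. If they are missing you'll get a LookupError, and a one-off download fixes it (a minimal sketch, assuming a default NLTK install):

nltk.download('punkt')      # used by nltk.word_tokenize
nltk.download('stopwords')  # used by nltk.corpus.stopwords
nltk.download('wordnet')    # used by nltk.WordNetLemmatizer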

Load data

with open('14_11_2019_rentals_raw.csv') as f:
    df = pd.read_csv(f)

Basic data preparation

reformat the input

def data_clean(df):
    # price should only contain numbers; drop NaNs and non-numeric records
    temp = df[~df['price'].isnull()]
    df = temp[temp['price'].str.isdigit()].reset_index(drop=True)
    # reformat some features
    df['price'] = df['price'].astype(float)
    # some prices are not reasonable, drop them
    df = df.loc[df['price'] > 140, :].reset_index(drop=True)
    df['suburb'] = df['address'].str.rsplit(',', n=1, expand=True)[1].str.strip()
    df['bond'] = df['bond'].str.split('$', expand=True)[1].str.replace(',', '').astype(float)
    df['agent_suburb'] = df['agent_brand'].str.split('-', expand=True)[1]
    df['agent_brand'] = df['agent_brand'].str.split('-', expand=True)[0]
    df['agent_brand'] = df['agent_brand'].str.strip()
    # some agent_name values are prefixed with 'Agent:'; keep only the name part
    temp = df['agent_name'].str.split(':', expand=True)
    replace = temp.loc[temp[0].str.lower() == 'agent', :]
    old = temp.loc[temp[0].str.lower() != 'agent', :]
    replace[0] = replace[1]
    df['agent_name'] = pd.concat([old, replace]).sort_index()[0]
    return df

df_clean = data_clean(df)
/Users/yaoyao/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:17: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
df_clean.head()

property_type price bond address feature_bedroom feature_bathroom feature_parking agent_brand agent_name available_date property_details suburb agent_suburb
0 Apartment 450.0 1956.0 102/40 Bettina Street, Clayton 2 1 1 Harcourts Steven Ker Open Sat 16 Nov 11:45am View allAvailable 25 N... First floor apartment with balcony consists of... Clayton Judd White
1 Townhouse 560.0 2433.0 15 Gentle Street, Clayton 3 2 2 Buxton Talia Karagaslis Available 20 Dec 2019 This beautiful three bedroom townhouse is loca... Clayton Oakleigh
2 Townhouse 590.0 2564.0 4 Main Road, Clayton 3 2 2 Ray White Anthony Lu Open today 4:45pm View allAvailable 22 Nov 2019 Only minutes to Clayton train station, bus sto... Clayton Oakleigh
3 Apartment 340.0 1477.0 7/126 Wellington Road, Clayton 2 1 1 Ray White Anthony Lu Open today 12:00pm View allAvailable 25 Nov 2019 Located within walking distance to Monash Univ... Clayton Oakleigh
4 Unit 475.0 2064.0 Address available on request, Clayton 3 1 2 Ray White Johnson Tan Available 01 Dec 2019 * Three generous-sized Bedrooms with Built-In ... Clayton Oakleigh

Apply basic text cleaning to the property details.

The basic cleaning includes:

  • remove punctuation
  • convert everything to lower case
  • tokenize into words
  • remove English stop words
  • lemmatize each word
def remove_punct(text):
    # strip punctuation, then digits, tabs and bullet characters
    newtext = "".join([char for char in text if char not in string.punctuation])
    newtext = re.sub('[0-9\t/•]+', '', newtext)
    return newtext

def remove_stopword(text):
    # drop common English stop words
    newtext = [word for word in text if word not in nltk.corpus.stopwords.words('english')]
    return newtext

def apply_lemmatization(text):
    # reduce each word to its WordNet lemma
    WNlemma = nltk.WordNetLemmatizer()
    newtext = [WNlemma.lemmatize(word) for word in text]
    return newtext

def text_clean(df_clean):
    df_clean['pd_punct'] = df_clean['property_details'].apply(lambda x: remove_punct(x)).str.lower()
    temp = df_clean['pd_punct'].apply(lambda x: nltk.word_tokenize(x))
    temp = temp.apply(lambda x: remove_stopword(x))
    df_clean['pd_final'] = temp.apply(lambda x: apply_lemmatization(x))

text_clean(df_clean)
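As a quick sanity check on the cleaning, it can help to eyeball a raw description next to its tokenised form (a minimal sketch; output not shown here):

df_clean[['property_details', 'pd_final']].head(2)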

Some EDA and Visualization

Word Cloud

def build_input(df):
    listall = []
    # common words we don't want to show in the word cloud
    noshow = ['inspection','time','open','book','apartment','house','studio','email','agent',
              'cancellation','update','change','informed','property','inspect','townhouse']
    for line in df:
        line_All = " ".join(word for word in line if word not in noshow)
        listall.append(line_All)
    # join with a space so words from adjacent listings don't get glued together
    showall = " ".join(listall)
    return showall

def plot_wordcloud(df_clean):
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    df_apt = df_clean.loc[df_clean['property_type'] == 'Apartment','pd_final']
    df_townhouse = df_clean.loc[df_clean['property_type'] == 'Townhouse','pd_final']
    df_house = df_clean.loc[df_clean['property_type'] == 'House','pd_final']
    df_studio = df_clean.loc[df_clean['property_type'] == 'Studio','pd_final']

    list_apt = build_input(df_apt)
    list_townhouse = build_input(df_townhouse)
    list_house = build_input(df_house)
    list_studio = build_input(df_studio)

    fig, ax = plt.subplots(4, 1, figsize=(30,40))
    wordcloud_apt = WordCloud(max_font_size=30, max_words=100, background_color="white") \
        .generate(list_apt)
    wordcloud_townhouse = WordCloud(max_font_size=30, max_words=100, background_color="white") \
        .generate(list_townhouse)
    wordcloud_house = WordCloud(max_font_size=30, max_words=100, background_color="white") \
        .generate(list_house)
    wordcloud_studio = WordCloud(max_font_size=30, max_words=100, background_color="white") \
        .generate(list_studio)
    ax[0].imshow(wordcloud_apt, interpolation='bilinear')
    ax[0].set_title("Common Words in Apartment Description", fontsize="20")
    ax[0].axis('off')
    ax[1].imshow(wordcloud_townhouse, interpolation='bilinear')
    ax[1].set_title("Common Words in Town House Description", fontsize="20")
    ax[1].axis('off')
    ax[2].imshow(wordcloud_house, interpolation='bilinear')
    ax[2].set_title("Common Words in House Description", fontsize="20")
    ax[2].axis('off')
    ax[3].imshow(wordcloud_studio, interpolation='bilinear')
    ax[3].set_title("Common Words in Studio Description", fontsize="20")
    ax[3].axis('off')

plot_wordcloud(df_clean)

[Figure: word clouds of common words in apartment, townhouse, house and studio descriptions]

This is quite an interesting plot, and I really like it.

  • the key words for houses/townhouses/apartments/studios are quite different
  • it's easy to spot some interesting patterns, e.g. for studios, "fully furnished" is a key phrase that doesn't appear for the other property types
  • train station, walking distance and steel appliance are common across all types

Word frequency

def build_input_wf(df):
    listall = []
    # common words we don't want to include in the counts
    noshow = ['inspection','time','open','book','apartment','house','studio','email','agent',
              'cancellation','update','change','informed','property','inspect','townhouse']
    for line in df:
        line_All = [word for word in line if word not in noshow]
        listall.extend(line_All)
    return listall

def plot_word_frequency(df_clean):
    from nltk.probability import FreqDist
    df_apt = df_clean.loc[df_clean['property_type'] == 'Apartment','pd_final']
    df_townhouse = df_clean.loc[df_clean['property_type'] == 'Townhouse','pd_final']
    df_house = df_clean.loc[df_clean['property_type'] == 'House','pd_final']
    df_studio = df_clean.loc[df_clean['property_type'] == 'Studio','pd_final']

    list_apt = build_input_wf(df_apt)
    list_townhouse = build_input_wf(df_townhouse)
    list_house = build_input_wf(df_house)
    list_studio = build_input_wf(df_studio)
    frequency_house = FreqDist(list_house)

    return frequency_house.most_common(n=20)
plot_word_frequency(df_clean)
[('bedroom', 2769),
 ('melbourne', 1854),
 ('kitchen', 1713),
 ('living', 1494),
 ('area', 1438),
 ('bathroom', 1427),
 ('one', 1082),
 ('feature', 1052),
 ('space', 1032),
 ('room', 1030),
 ('city', 978),
 ('view', 943),
 ('located', 938),
 ('laundry', 932),
 ('building', 917),
 ('floor', 897),
 ('access', 895),
 ('appliance', 894),
 ('private', 878),
 ('street', 864)]
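The raw counts are easier to compare at a glance as a chart. Here is a minimal sketch of a horizontal bar chart built on top of plot_word_frequency above (the top_words variable name is mine):

top_words = plot_word_frequency(df_clean)   # list of (word, count) tuples
words, counts = zip(*top_words)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(words[::-1], counts[::-1])          # reverse so the most frequent word sits on top
ax.set_xlabel('count')
ax.set_title('Top 20 words in house descriptions')
plt.show()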

word2vec

build vocab and train the model

wordlist = df_clean['pd_final'].to_list()
model = gensim.models.Word2Vec(
    wordlist,
    size=150,
    window=10,
    min_count=2,
    workers=10,
    iter=10)
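Training only takes a moment on this corpus, but if you want to reuse the vectors later without retraining, gensim can persist the whole model (the filename here is just an example):

model.save('rentals_w2v.model')                              # write the trained model to disk
# model = gensim.models.Word2Vec.load('rentals_w2v.model')  # reload it later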
w1 = 'train'
model.wv.most_similar(positive=w1)
[('bus', 0.9083044528961182),
 ('stop', 0.8881006240844727),
 ('railway', 0.8856533169746399),
 ('station', 0.812629759311676),
 ('route', 0.8058450818061829),
 ('springvale', 0.8043614625930786),
 ('tram', 0.7951110601425171),
 ('center', 0.7859233617782593),
 ('shopping', 0.7831301689147949),
 ('parliament', 0.781488299369812)]
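Beyond most_similar, a couple of other handy queries on the same trained vectors, as a quick sketch (the words are picked purely for illustration):

model.wv.similarity('train', 'tram')                        # cosine similarity between two words
model.wv.doesnt_match(['train', 'tram', 'bus', 'kitchen'])  # the odd one out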
model.wv['train']  # the raw 150-dimensional vector for 'train'
array([-0.9055207 ,  0.3383803 , -1.0070474 ,  0.6496934 ,  0.5079288 ,
        0.2651052 ,  0.28885823, -0.37675577,  0.20077397,  2.2651038 ,
       -0.634687  , -0.12877388, -1.1244543 , -0.16395192, -1.1265266 ,
        1.987248  , -1.7640041 ,  0.0111573 ,  0.5782551 ,  0.3475309 ,
        3.0830522 ,  1.7851418 ,  1.8113701 ,  1.2043551 , -0.494675  ,
       -0.38805577, -0.84290814,  0.3232301 , -0.2537486 ,  0.6098439 ,
        0.87573195, -1.6538352 ,  0.7550999 , -0.18528049,  0.22006646,
       -0.05651824, -2.0897374 , -0.3931295 ,  2.0241232 ,  0.65036017,
        0.38358206,  0.40651393, -0.04384508,  1.4108607 ,  0.6118807 ,
       -0.19642885,  1.3095136 ,  0.7283475 ,  0.6006522 , -1.2531968 ,
        0.14988308,  1.6452932 , -0.70153093,  0.8074702 , -2.211348  ,
       -1.236841  ,  0.4659164 ,  0.16054933, -1.6786027 ,  0.94784266,
        0.4248309 ,  0.97811186,  1.7608812 ,  0.08076338,  1.0514913 ,
        1.4854838 ,  0.17836648,  0.02803503, -0.73165715, -0.6264381 ,
       -3.764019  ,  1.8128228 ,  1.8578556 , -2.208528  , -1.132535  ,
       -2.2616606 , -0.07818842,  0.10757745, -0.32016277,  0.6547073 ,
        0.70446897, -0.02661847,  0.38358182, -0.43814224, -0.42898545,
        0.12477661, -0.6984484 , -0.27979696,  2.0922875 ,  0.7715444 ,
        1.5044686 , -0.5870474 , -0.7020896 ,  1.7359827 ,  1.5990449 ,
       -0.4320323 ,  2.108935  , -0.41747302, -1.145535  ,  2.4129176 ,
        0.89127445, -0.42052403, -1.7306162 , -1.8634758 , -2.7774615 ,
        2.9477477 , -0.05811455, -0.3156799 ,  0.9396679 , -0.42951274,
        0.76986086, -1.3020213 ,  1.4949676 ,  0.7106628 , -0.631309  ,
       -0.04139144,  1.687128  ,  0.5391253 , -0.76657486, -0.29014868,
       -0.280084  , -0.98488003,  0.20266072, -1.1121765 ,  0.7734707 ,
       -0.9676994 , -0.8653948 ,  0.78150386,  1.8499094 , -0.14956334,
        1.8008482 , -0.393235  ,  0.32997358, -1.920201  ,  0.20352392,
        0.5000303 , -1.2298875 ,  0.8242096 ,  0.15986371,  1.2411798 ,
        0.37643033,  1.6649051 ,  0.76609766,  1.549611  , -0.75124633,
       -1.7489674 , -3.142393  , -2.14125   ,  0.21818882,  0.3581236 ],
      dtype=float32)
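To tie the raw vectors back to the similarity scores above, the cosine similarity can also be computed by hand with numpy; it should agree with the 'bus' score from most_similar (a minimal sketch):

v1 = model.wv['train']
v2 = model.wv['bus']
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cos)  # ~0.908, matching the most_similar output above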