
Melbourne Rental Project Part 3: word2vec

Now let’s try word2vec on the property details page and see if we can find something interesting.

load libs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import nltk
import string
import re
%matplotlib inline
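One thing worth noting: the tokenizer, stop-word list and lemmatizer used further down rely on NLTK corpora that aren't bundled with the library. If they are missing you'll get a LookupError, and a one-off download fixes it (a minimal sketch, assuming a default NLTK install):

nltk.download('punkt')      # used by nltk.word_tokenize
nltk.download('stopwords')  # used by nltk.corpus.stopwords
nltk.download('wordnet')    # used by nltk.WordNetLemmatizer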

Load data

with open('14_11_2019_rentals_raw.csv') as f:
    df = pd.read_csv(f)

Basic data preparation

reformat the input

def data_clean(df):
    # price should only contain numbers; drop NaNs and non-numeric records
    temp = df[~df['price'].isnull()]
    df = temp[temp['price'].str.isdigit()].reset_index(drop=True)
    # reformat some features
    df['price'] = df['price'].astype(float)
    # some prices are not reasonable, drop them
    df = df.loc[df['price'] > 140, :].reset_index(drop=True)
    df['suburb'] = df['address'].str.rsplit(',', n=1, expand=True)[1].str.strip()
    df['bond'] = df['bond'].str.split('$', expand=True)[1].str.replace(',', '').astype(float)
    df['agent_suburb'] = df['agent_brand'].str.split('-', expand=True)[1]
    df['agent_brand'] = df['agent_brand'].str.split('-', expand=True)[0]
    df['agent_brand'] = df['agent_brand'].str.strip()
    # some agent_name values are prefixed with 'Agent:'; keep only the name part
    temp = df['agent_name'].str.split(':', expand=True)
    replace = temp.loc[temp[0].str.lower() == 'agent', :]
    old = temp.loc[temp[0].str.lower() != 'agent', :]
    replace[0] = replace[1]
    df['agent_name'] = pd.concat([old, replace]).sort_index()[0]
    return df

df_clean = data_clean(df)
/Users/yaoyao/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:17: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
df_clean.head()

property_type price bond address feature_bedroom feature_bathroom feature_parking agent_brand agent_name available_date property_details suburb agent_suburb
0 Apartment 450.0 1956.0 102/40 Bettina Street, Clayton 2 1 1 Harcourts Steven Ker Open Sat 16 Nov 11:45am View allAvailable 25 N... First floor apartment with balcony consists of... Clayton Judd White
1 Townhouse 560.0 2433.0 15 Gentle Street, Clayton 3 2 2 Buxton Talia Karagaslis Available 20 Dec 2019 This beautiful three bedroom townhouse is loca... Clayton Oakleigh
2 Townhouse 590.0 2564.0 4 Main Road, Clayton 3 2 2 Ray White Anthony Lu Open today 4:45pm View allAvailable 22 Nov 2019 Only minutes to Clayton train station, bus sto... Clayton Oakleigh
3 Apartment 340.0 1477.0 7/126 Wellington Road, Clayton 2 1 1 Ray White Anthony Lu Open today 12:00pm View allAvailable 25 Nov 2019 Located within walking distance to Monash Univ... Clayton Oakleigh
4 Unit 475.0 2064.0 Address available on request, Clayton 3 1 2 Ray White Johnson Tan Available 01 Dec 2019 * Three generous-sized Bedrooms with Built-In ... Clayton Oakleigh

Apply basic text cleaning to the property details.

The basic cleaning includes:

  • remove punctuation
  • convert everything to lower case
  • tokenize into words
  • remove English stop words
  • lemmatize each word
def remove_punct(text):
    # strip punctuation, then digits, tabs and bullet characters
    newtext = "".join([char for char in text if char not in string.punctuation])
    newtext = re.sub('[0-9\t/•]+', '', newtext)
    return newtext

def remove_stopword(text):
    # drop common English stop words
    newtext = [word for word in text if word not in nltk.corpus.stopwords.words('english')]
    return newtext

def apply_lemmatization(text):
    # reduce each word to its WordNet lemma
    WNlemma = nltk.WordNetLemmatizer()
    newtext = [WNlemma.lemmatize(word) for word in text]
    return newtext

def text_clean(df_clean):
    df_clean['pd_punct'] = df_clean['property_details'].apply(lambda x: remove_punct(x)).str.lower()
    temp = df_clean['pd_punct'].apply(lambda x: nltk.word_tokenize(x))
    temp = temp.apply(lambda x: remove_stopword(x))
    df_clean['pd_final'] = temp.apply(lambda x: apply_lemmatization(x))

text_clean(df_clean)
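As a quick sanity check on the cleaning, it can help to eyeball a raw description next to its tokenised form (a minimal sketch; output not shown here):

df_clean[['property_details', 'pd_final']].head(2)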

Some EDA and Visualization

Word Cloud

def build_input(df):
    listall = []
    # common words we don't want to show in the word cloud
    noshow = ['inspection','time','open','book','apartment','house','studio','email','agent',
              'cancellation','update','change','informed','property','inspect','townhouse']
    for line in df:
        line_All = " ".join(word for word in line if word not in noshow)
        listall.append(line_All)
    # join with a space so words from adjacent listings don't get glued together
    showall = " ".join(listall)
    return showall

def plot_wordcloud(df_clean):
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    df_apt = df_clean.loc[df_clean['property_type'] == 'Apartment','pd_final']
    df_townhouse = df_clean.loc[df_clean['property_type'] == 'Townhouse','pd_final']
    df_house = df_clean.loc[df_clean['property_type'] == 'House','pd_final']
    df_studio = df_clean.loc[df_clean['property_type'] == 'Studio','pd_final']

    list_apt = build_input(df_apt)
    list_townhouse = build_input(df_townhouse)
    list_house = build_input(df_house)
    list_studio = build_input(df_studio)

    fig, ax = plt.subplots(4, 1, figsize=(30,40))
    wordcloud_apt = WordCloud(max_font_size=30, max_words=100, background_color="white") \
        .generate(list_apt)
    wordcloud_townhouse = WordCloud(max_font_size=30, max_words=100, background_color="white") \
        .generate(list_townhouse)
    wordcloud_house = WordCloud(max_font_size=30, max_words=100, background_color="white") \
        .generate(list_house)
    wordcloud_studio = WordCloud(max_font_size=30, max_words=100, background_color="white") \
        .generate(list_studio)
    ax[0].imshow(wordcloud_apt, interpolation='bilinear')
    ax[0].set_title("Common Words in Apartment Description", fontsize="20")
    ax[0].axis('off')
    ax[1].imshow(wordcloud_townhouse, interpolation='bilinear')
    ax[1].set_title("Common Words in Town House Description", fontsize="20")
    ax[1].axis('off')
    ax[2].imshow(wordcloud_house, interpolation='bilinear')
    ax[2].set_title("Common Words in House Description", fontsize="20")
    ax[2].axis('off')
    ax[3].imshow(wordcloud_studio, interpolation='bilinear')
    ax[3].set_title("Common Words in Studio Description", fontsize="20")
    ax[3].axis('off')

plot_wordcloud(df_clean)

[Figure: word clouds of common words in apartment, townhouse, house and studio descriptions]

This is quite an interesting plot, and I really like it.

  • the key words for houses/townhouses/apartments/studios are quite different
  • it's easy to spot some interesting patterns, e.g. for studios, "fully furnished" is a key phrase that doesn't appear for the other property types
  • train station, walking distance and steel appliance are common across all types

Word frequency

def build_input_wf(df):
    listall = []
    # common words we don't want to include in the counts
    noshow = ['inspection','time','open','book','apartment','house','studio','email','agent',
              'cancellation','update','change','informed','property','inspect','townhouse']
    for line in df:
        line_All = [word for word in line if word not in noshow]
        listall.extend(line_All)
    return listall

def plot_word_frequency(df_clean):
    from nltk.probability import FreqDist
    df_apt = df_clean.loc[df_clean['property_type'] == 'Apartment','pd_final']
    df_townhouse = df_clean.loc[df_clean['property_type'] == 'Townhouse','pd_final']
    df_house = df_clean.loc[df_clean['property_type'] == 'House','pd_final']
    df_studio = df_clean.loc[df_clean['property_type'] == 'Studio','pd_final']

    list_apt = build_input_wf(df_apt)
    list_townhouse = build_input_wf(df_townhouse)
    list_house = build_input_wf(df_house)
    list_studio = build_input_wf(df_studio)
    frequency_house = FreqDist(list_house)

    return frequency_house.most_common(n=20)
plot_word_frequency(df_clean)
[('bedroom', 2769),
 ('melbourne', 1854),
 ('kitchen', 1713),
 ('living', 1494),
 ('area', 1438),
 ('bathroom', 1427),
 ('one', 1082),
 ('feature', 1052),
 ('space', 1032),
 ('room', 1030),
 ('city', 978),
 ('view', 943),
 ('located', 938),
 ('laundry', 932),
 ('building', 917),
 ('floor', 897),
 ('access', 895),
 ('appliance', 894),
 ('private', 878),
 ('street', 864)]
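The raw counts are easier to compare at a glance as a chart. Here is a minimal sketch of a horizontal bar chart built on top of plot_word_frequency above (the top_words variable name is mine):

top_words = plot_word_frequency(df_clean)   # list of (word, count) tuples
words, counts = zip(*top_words)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(words[::-1], counts[::-1])          # reverse so the most frequent word sits on top
ax.set_xlabel('count')
ax.set_title('Top 20 words in house descriptions')
plt.show()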

word2vec

build vocab and train the model

wordlist = df_clean['pd_final'].to_list()
model = gensim.models.Word2Vec(
    wordlist,
    size=150,
    window=10,
    min_count=2,
    workers=10,
    iter=10)
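Training only takes a moment on this corpus, but if you want to reuse the vectors later without retraining, gensim can persist the whole model (the filename here is just an example):

model.save('rentals_w2v.model')                              # write the trained model to disk
# model = gensim.models.Word2Vec.load('rentals_w2v.model')  # reload it later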
w1 = 'train'
model.wv.most_similar(positive=w1)
[('bus', 0.9083044528961182),
 ('stop', 0.8881006240844727),
 ('railway', 0.8856533169746399),
 ('station', 0.812629759311676),
 ('route', 0.8058450818061829),
 ('springvale', 0.8043614625930786),
 ('tram', 0.7951110601425171),
 ('center', 0.7859233617782593),
 ('shopping', 0.7831301689147949),
 ('parliament', 0.781488299369812)]
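Beyond most_similar, a couple of other handy queries on the same trained vectors, as a quick sketch (the words are picked purely for illustration):

model.wv.similarity('train', 'tram')                        # cosine similarity between two words
model.wv.doesnt_match(['train', 'tram', 'bus', 'kitchen'])  # the odd one out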
model.wv['train']  # the raw 150-dimensional vector for 'train'
array([-0.9055207 ,  0.3383803 , -1.0070474 ,  0.6496934 ,  0.5079288 ,
        0.2651052 ,  0.28885823, -0.37675577,  0.20077397,  2.2651038 ,
       -0.634687  , -0.12877388, -1.1244543 , -0.16395192, -1.1265266 ,
        1.987248  , -1.7640041 ,  0.0111573 ,  0.5782551 ,  0.3475309 ,
        3.0830522 ,  1.7851418 ,  1.8113701 ,  1.2043551 , -0.494675  ,
       -0.38805577, -0.84290814,  0.3232301 , -0.2537486 ,  0.6098439 ,
        0.87573195, -1.6538352 ,  0.7550999 , -0.18528049,  0.22006646,
       -0.05651824, -2.0897374 , -0.3931295 ,  2.0241232 ,  0.65036017,
        0.38358206,  0.40651393, -0.04384508,  1.4108607 ,  0.6118807 ,
       -0.19642885,  1.3095136 ,  0.7283475 ,  0.6006522 , -1.2531968 ,
        0.14988308,  1.6452932 , -0.70153093,  0.8074702 , -2.211348  ,
       -1.236841  ,  0.4659164 ,  0.16054933, -1.6786027 ,  0.94784266,
        0.4248309 ,  0.97811186,  1.7608812 ,  0.08076338,  1.0514913 ,
        1.4854838 ,  0.17836648,  0.02803503, -0.73165715, -0.6264381 ,
       -3.764019  ,  1.8128228 ,  1.8578556 , -2.208528  , -1.132535  ,
       -2.2616606 , -0.07818842,  0.10757745, -0.32016277,  0.6547073 ,
        0.70446897, -0.02661847,  0.38358182, -0.43814224, -0.42898545,
        0.12477661, -0.6984484 , -0.27979696,  2.0922875 ,  0.7715444 ,
        1.5044686 , -0.5870474 , -0.7020896 ,  1.7359827 ,  1.5990449 ,
       -0.4320323 ,  2.108935  , -0.41747302, -1.145535  ,  2.4129176 ,
        0.89127445, -0.42052403, -1.7306162 , -1.8634758 , -2.7774615 ,
        2.9477477 , -0.05811455, -0.3156799 ,  0.9396679 , -0.42951274,
        0.76986086, -1.3020213 ,  1.4949676 ,  0.7106628 , -0.631309  ,
       -0.04139144,  1.687128  ,  0.5391253 , -0.76657486, -0.29014868,
       -0.280084  , -0.98488003,  0.20266072, -1.1121765 ,  0.7734707 ,
       -0.9676994 , -0.8653948 ,  0.78150386,  1.8499094 , -0.14956334,
        1.8008482 , -0.393235  ,  0.32997358, -1.920201  ,  0.20352392,
        0.5000303 , -1.2298875 ,  0.8242096 ,  0.15986371,  1.2411798 ,
        0.37643033,  1.6649051 ,  0.76609766,  1.549611  , -0.75124633,
       -1.7489674 , -3.142393  , -2.14125   ,  0.21818882,  0.3581236 ],
      dtype=float32)
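To tie the raw vectors back to the similarity scores above, the cosine similarity can also be computed by hand with numpy; it should agree with the 'bus' score from most_similar (a minimal sketch):

v1 = model.wv['train']
v2 = model.wv['bus']
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cos)  # ~0.908, matching the most_similar output above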