# An R script for examining Airbnb listing quantity and
# longevity
# by Martin John Brown
# http://martinjohnbrown.net
# released under the Creative Commons
# "Attribution-NonCommercial-ShareAlike 4.0 International"
# license: http://creativecommons.org/licenses/by-nc-sa/4.0/

# note: this script ran successfully for me under
# R version 3.2.3 (2015-12-10), via RStudio 0.99.491, in
# late May 2016. Your mileage may vary.

# In this R script I collect Airbnb listing data
# for various cities and examine it with two questions in mind:
# a) how is the total number of listings changing over time?
# b) how long-lived are individual listings?

# Professional coders should note I've never had any training
# in coding, so it's probably messy by some people's standards.

# METHOD AND VOCABULARY

# The web site http://insideairbnb.com contains "scrapes"
# (or copies) of all live Airbnb listing data in various cities
# on various dates. Note that Airbnb calls each listing
# a "room," but there are three "room types": shared room,
# private room, and entire home.

# When there is more than one scrape per city, and I know the
# date of each scrape, I can calculate the total number of
# rooms of each type available on each scrape.
# Graphing this over the multiple scrape dates will show how
# the total number of listings per city is growing
# or shrinking.

# Furthermore, checking the appearance and disappearance of
# individual "rooms" across multiple scrapes will allow me to
# estimate something about the longevity of Airbnb listings --
# in particular, the proportion of Airbnb listings that are
# likely to persist over defined amounts of time. Listings
# that do not persist as Airbnb offerings may have reverted
# to some other use.

# This script has a number of sections:
## -loading R packages
## -creating a database of available scrape files
## -choosing the cities to study
## -downloading all those files and compiling them into
##  a single R data frame
## -listing all the pairs of scrape dates for each city
## -merging the scrape-date pairs with the actual data
## -calculating longevity from the scrape-date pairs
## -creating datasets to use in charts
## -defining and printing the charts

## loading some R packages:
library("dplyr")
library("stringr")
library("data.table")
library("ggplot2")

## creating a list of available scrape files

# downloading the data page HTML from insideAirbnb
availableDataFilesURL <- "http://insideairbnb.com/get-the-data.html"
# splitting the filename part out of that URL
availableDataFilesFileSpec <-
  str_split(availableDataFilesURL, "/")[[1]][4]
# downloading the file
download.file(
  availableDataFilesURL,
  availableDataFilesFileSpec
)
# turning it into a single string so I can extract
# the data URLs
studyObject <- readChar(
  availableDataFilesFileSpec,
  file.info(availableDataFilesFileSpec)$size
)
# extracting all the data URLs.
# sorry, I am not great with regular expressions,
# so I have to do this in several steps
dataURLlist <- str_extract_all(
  studyObject,
  '"http://data.*listings.csv.gz"'
)[[1]]
dataURLlist <- str_sub(
  dataURLlist,
  start = 2,
  end = str_length(dataURLlist) - 1
)
# dissecting those URLs and getting info out of them
dataURLpieces <- str_split(dataURLlist, "/")
for (i in 1:length(dataURLpieces)) {
  fileSpec <- dataURLpieces[[i]][9]
  scrapeDate <- as.Date(dataURLpieces[[i]][7])
  city <- dataURLpieces[[i]][6]
  region <- dataURLpieces[[i]][5]
  country <- dataURLpieces[[i]][4]
  URL <- dataURLlist[i]
  tempDF <- data.frame(
    fileSpec, city, scrapeDate, region, country, URL,
    stringsAsFactors = FALSE
  )
  if (i == 1) {
    dataURLs <- tempDF[0, ]
  } # closes the if statement
  dataURLs <- rbind(dataURLs, tempDF)
} # closes the for loop

# now I have "dataURLs", a data frame
# containing all the cities and scrapes and the
# URLs at which the scrapes can be obtained.
# here is a sample of what the dataURLs table
# should look like...
# > head(dataURLs,2)
#          fileSpec      city scrapeDate        region         country
# 1 listings.csv.gz amsterdam 2016-01-03 north-holland the-netherlands
# 2 listings.csv.gz amsterdam 2015-09-03 north-holland the-netherlands
#   URL
# 1 http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2016-01-03/data/listings.csv.gz
# 2 http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2015-09-03/data/listings.csv.gz

## choosing the cities I want to study

# it is helpful to study only a few cities at a time,
# to prevent processing errors. Depending on the
# host machine, R may run out of memory to work with.
# Moreover, the csv data files provided by insideAirbnb do
# not always have the same structure, and there are too many
# of them to write a custom import routine for each city
# and scrape. In lieu of that I have used the R function
# fread(), which uses column names instead of locations
# and can therefore adapt to some variations in the source
# data files. However, a few cities still fail.

# first, listing the number of scrapes by city
table(dataURLs$city)

# the output from table() should look
# something like this...
#     amsterdam           antwerp         asheville            athens
#             4                 1                 1                 1
#        austin         barcelona            berlin            boston
#             2                 5                 1                 1
#      brussels           chicago            dublin            london
#             1                 1                 1                 3
#   los-angeles            madrid          mallorca         melbourne
#             6                 3                 1                 5
#      montreal         nashville       new-orleans     new-york-city
#             2                 2                 3                15
#       oakland             paris          portland         san-diego
#             2                 3                 4                 1
# san-francisco santa-cruz-county           seattle            sydney
#             3                 1                 2                 5
#       toronto          trentino         vancouver            venice
#             2                 1                 2                 1
#        vienna     washington-dc
#             1                 1

# we are only concerned with cities
# that have >=3 scrapes.
# let's pick a few to work with.
cityList <- which(
  dataURLs$city=="amsterdam" #works
  |
  # dataURLs$city=="austin" #works
  # |
  # dataURLs$city=="barcelona" #fread error
  # |
  dataURLs$city=="london" #works
  |
  dataURLs$city=="los-angeles" # works
  |
  dataURLs$city=="madrid" # works
  |
  dataURLs$city=="melbourne" # works
  |
  # dataURLs$city=="nashville" # works
  # |
  dataURLs$city=="new-orleans" #works
  |
  # dataURLs$city=="new-york-city" # download/import error
  # |
  # dataURLs$city=="paris" # fread error
  # |
  dataURLs$city=="portland" #works
  |
  dataURLs$city=="san-francisco" #works
  |
  # dataURLs$city=="seattle" #works
  # |
  dataURLs$city=="sydney" #works
  # |
  # dataURLs$city=="toronto" #works
  # |
  # dataURLs$city=="vancouver"
)

## downloading all those scrape files, and putting
## them into a single R data frame.

# I use a loop to go through all the scrapes.
# warning: this may take a long time
# and cause errors;
# you will need to check the results
for (j in cityList) {
  # these details will identify the scrape
  sampleURL <- dataURLs[j,]$URL
  sampleURLfileSpec <- dataURLs[j,]$fileSpec
  city <- dataURLs[j,]$city
  scrapeDate <- dataURLs[j,]$scrapeDate
  # downloading the scrape file
  download.file(sampleURL, destfile=sampleURLfileSpec)
  # unzipping it
  R.utils::gunzip(sampleURLfileSpec, remove=TRUE, overwrite=TRUE)
  # I am assuming the unzipped file is called "listings.csv".
  # now reading in the unzipped file with data.table,
  # getting only two columns, "id" and "room_type"
  sampleData <- fread(
    "listings.csv",
    select = c("id","room_type")
  )
  tempScrapeFile <- data.table(sampleData, city, scrapeDate)
  if (j==cityList[1]) {
    theScrapeData <- tempScrapeFile[0,]
  } # closes that short if section
  theScrapeData <- rbind(theScrapeData, tempScrapeFile)
} # closes for loop j

# if that loop produced fread() errors in the form
# "Read less rows (100) than were allocated (200)", the
# results are probably OK. Other fread() errors should
# be investigated.
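# (an alternative sketch, in case one bad scrape aborts the
# whole loop: wrap the risky steps in tryCatch() so a failure
# is reported and skipped instead of stopping everything.
# downloadOneScrape() is an invented helper, not used elsewhere
# in this script; it just repeats the same download/gunzip/fread
# steps as the loop above and returns NULL on failure.)
downloadOneScrape <- function(j) {
  tryCatch({
    download.file(dataURLs[j,]$URL, destfile=dataURLs[j,]$fileSpec)
    R.utils::gunzip(dataURLs[j,]$fileSpec, remove=TRUE, overwrite=TRUE)
    sampleData <- fread("listings.csv", select=c("id","room_type"))
    data.table(sampleData,
               city=dataURLs[j,]$city,
               scrapeDate=dataURLs[j,]$scrapeDate)
  }, error = function(e) {
    # report the failure and let the caller skip this scrape
    message("scrape failed: ", dataURLs[j,]$city, " ",
            dataURLs[j,]$scrapeDate, " (", conditionMessage(e), ")")
    NULL
  })
}
# usage would look something like:
# scrapes <- lapply(cityList, downloadOneScrape)
# theScrapeData <- rbindlist(scrapes[!sapply(scrapes, is.null)])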
# theScrapeData has the raw data for multiple cities.
# these are the 'positives,' the listings which were on
# offer on the scrape dates recorded. It should look
# something like this...
# > theScrapeData
#             id       room_type      city scrapeDate
#     1: 7328003 Entire home/apt    madrid 2015-10-02
#     2: 6289024    Private room    madrid 2015-10-02
#     3: 3778195     Shared room    madrid 2015-10-02
#     4: 7056183    Private room    madrid 2015-10-02
#     5: 6847125    Private room    madrid 2015-10-02
#    ---
# 57362: 2224699    Private room melbourne 2015-07-18
# 57363:  628156    Private room melbourne 2015-07-18
# 57364: 3585560    Private room melbourne 2015-07-18
# 57365:  229502 Entire home/apt melbourne 2015-07-18
# 57366: 1605946    Private room melbourne 2015-07-18

# looking at theScrapeData to extract some useful
# reference and lookup info:

# a) current cities of interest and their scrape dates
currentCitiesAndScrapes <- unique(
  select(theScrapeData, city, scrapeDate) %>%
    arrange(city, scrapeDate)
)
# that table should look something like this:
# > currentCitiesAndScrapes
#         city scrapeDate
# 1:    madrid 2015-07-17
# 2:    madrid 2015-09-04
# 3:    madrid 2015-10-02
# 4: melbourne 2015-07-18
# 5: melbourne 2015-09-03
# 6: melbourne 2015-10-02
# 7: melbourne 2015-12-03
# 8: melbourne 2016-01-03

# b) current cities of interest only
currentCities <- unique(currentCitiesAndScrapes$city)
# that should look something like this:
# > currentCities
# [1] "madrid"    "melbourne"

# c) a list of the pairs of scrape dates
# available for each city individually.
# this is important because the persistence analysis
# needs to compare individual listings on pairs of
# dates
for (k in currentCities) {
  cityDateList <- currentCitiesAndScrapes[
    which(currentCitiesAndScrapes$city==k),
  ]$scrapeDate
  cityDateCombos <- as.data.frame(
    t(combn(cityDateList, 2))
  )
  cityDateCombos <- rename(
    cityDateCombos,
    scrapeDate1=V1,
    scrapeDate2=V2
  )
  cityDateCombos$scrapeDate1 <-
    as.Date(cityDateCombos$scrapeDate1, origin="1970-01-01")
  cityDateCombos$scrapeDate2 <-
    as.Date(cityDateCombos$scrapeDate2, origin="1970-01-01")
  cityDateCombos <- data.frame(city=k, cityDateCombos)
  if (k==currentCities[1]) {
    cityDateCombosAll <- cityDateCombos[0,]
  } # ends if clause
  cityDateCombosAll <- rbind(cityDateCombosAll, cityDateCombos)
} # ends for loop k

# note that cityDateCombosAll has all the conceivable pairs
# of dates for each city. It looks kinda like this...
# > cityDateCombosAll
#         city scrapeDate1 scrapeDate2
# 1     madrid  2015-07-17  2015-09-04
# 2     madrid  2015-07-17  2015-10-02
# 3     madrid  2015-09-04  2015-10-02
# 4  melbourne  2015-07-18  2015-09-03
# 5  melbourne  2015-07-18  2015-10-02
# 6  melbourne  2015-07-18  2015-12-03
# 7  melbourne  2015-07-18  2016-01-03
# 8  melbourne  2015-09-03  2015-10-02
# 9  melbourne  2015-09-03  2015-12-03
# 10 melbourne  2015-09-03  2016-01-03
# 11 melbourne  2015-10-02  2015-12-03
# 12 melbourne  2015-10-02  2016-01-03
# 13 melbourne  2015-12-03  2016-01-03
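# (a quick illustration of what combn() is doing in the loop
# above, on three made-up dates: combn() returns one column per
# pair, so t() flips it to one row per pair. Note that Dates get
# coerced to plain day counts inside the resulting matrix, which
# is why the loop restores them with
# as.Date(..., origin="1970-01-01").)
demoDates <- as.Date(c("2015-07-17", "2015-09-04", "2015-10-02"))
t(combn(demoDates, 2))
# should print a 3 x 2 matrix of day counts since 1970-01-01,
# roughly like this:
#       [,1]  [,2]
# [1,] 16633 16682
# [2,] 16633 16710
# [3,] 16682 16710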
## merging the scrape-date pairs with the observed
## scrape data.

# before I can start the merge I need to create and
# work with a list of "rooms" (airbnb's name for
# listings) without any dates attached.
theScrapeRooms <- summarise(
  group_by(theScrapeData, city, id, room_type),
  noScrapes=n()
)
theScrapeRooms$noScrapes <- NULL

# theScrapeRooms is the list of rooms. it
# should look something like this...
# > theScrapeRooms
# Source: local data table [20,609 x 3]
# Groups: city, id
#
#      city      id       room_type
#     (chr)   (int)           (chr)
# 1  madrid 7328003 Entire home/apt
# 2  madrid 6289024    Private room
# 3  madrid 3778195     Shared room
# 4  madrid 7056183    Private room
# 5  madrid 6847125    Private room
# 6  madrid 3453225    Private room
# 7  madrid  336869    Private room
# 8  madrid 5876946    Private room
# 9  madrid  316712    Private room
# 10 madrid 6584180    Private room

# there is one quirk in the list of rooms that
# could screw up our later calculations. a small
# number of airbnb hosts have changed the
# "room_type" of their listing over time -- for
# example, from "private room" to "shared room".
# Since we will always want to know how the room
# types differ, these particular "rooms" are not
# informative. We need to eliminate them from the
# study of persistence of individual listings over
# time (though they don't need to be eliminated
# from the study of the total number of listings
# over time). In the following code we will look
# for and eliminate those problematic listings.
# fortunately this seems to affect only about 1% of
# the rooms in most cities' datasets, so it should
# not throw off the results.

# looking to see if any rooms have been listed as >1
# room type
roomTypeCountPerRoom <- summarise(
  group_by(
    theScrapeRooms,
    city,
    id
  ),
  noRoomTypes=n()
)
# here are some typical results
# > table(
#     roomTypeCountPerRoom$city,
#     roomTypeCountPerRoom$noRoomTypes,
#     useNA = "ifany"
#   )
#
#                 1    2    3
#   amsterdam 16115  137    1
#   portland   3853   52    0
#
# the rare rooms with >1 room type will cause problems
# when it comes to calculating persistence. The hosts
# were not offering the same accommodation each time.
# Since they look to be a very small percentage of rooms,
# I will just remove them from the list of rooms
theScrapeRooms <- merge(
  theScrapeRooms,
  roomTypeCountPerRoom,
  by=c("city","id"),
  all.x=TRUE
) %>%
  filter(
    noRoomTypes==1
  ) %>%
  mutate(
    noRoomTypes=NULL
  )

# theScrapeRooms still has the same structure as before,
# just a few fewer rows of data, like this...
# > theScrapeRooms
# Source: local data table [20,338 x 3]
#
#      city    id       room_type
#     (chr) (int)           (chr)
# 1  madrid 18628 Entire home/apt
# 2  madrid 19864 Entire home/apt
# 3  madrid 21512 Entire home/apt
# 4  madrid 21548 Entire home/apt
# 5  madrid 21853    Private room
# 6  madrid 22736 Entire home/apt
# 7  madrid 23021 Entire home/apt
# 8  madrid 24805 Entire home/apt
# 9  madrid 24836 Entire home/apt
# 10 madrid 26571    Private room

# now creating a record for each conceivable pair
# of dates in each room in each city.
for (l in 1:nrow(cityDateCombosAll)) {
  tempCity <- cityDateCombosAll[l,]$city
  scrapeDate1 <- cityDateCombosAll[l,]$scrapeDate1
  scrapeDate2 <- cityDateCombosAll[l,]$scrapeDate2
  tempRooms <- theScrapeRooms[theScrapeRooms$city==tempCity,]
  tempCombo <- data.frame(
    tempRooms,
    scrapeDate1,
    scrapeDate2
  )
  if (l==1) {
    roomDateCombos <- tempCombo[0,]
  } # close if statement
  roomDateCombos <- rbind(roomDateCombos, tempCombo)
} # close for loop l

# the result should look something like this...
# > head(roomDateCombos)
#     city    id       room_type scrapeDate1 scrapeDate2
# 1 madrid 18628 Entire home/apt  2015-07-17  2015-09-04
# 2 madrid 19864 Entire home/apt  2015-07-17  2015-09-04
# 3 madrid 21512 Entire home/apt  2015-07-17  2015-09-04
# 4 madrid 21548 Entire home/apt  2015-07-17  2015-09-04
# 5 madrid 21853    Private room  2015-07-17  2015-09-04
# 6 madrid 22736 Entire home/apt  2015-07-17  2015-09-04

# now, doing the merge between all the possible rooms
# and dates and the ones that were actually observed
roomDateCombosInt1 <- merge(
  roomDateCombos,
  data.frame(theScrapeData, s1=1),
  by.x=c("city","id","room_type","scrapeDate1"),
  by.y=c("city","id","room_type","scrapeDate"),
  all.x=TRUE
)
roomDateCombosInt2 <- merge(
  roomDateCombosInt1,
  data.frame(theScrapeData, s2=1),
  by.x=c("city","id","room_type","scrapeDate2"),
  by.y=c("city","id","room_type","scrapeDate"),
  all.x=TRUE
)

# note roomDateCombosInt2 has all the conceivable combinations
# of room and scrape-date pairs. This is the data set from
# which we can estimate persistence. It should look
# something like this:
# > roomDateCombosInt2[52:57,]
#      city    id       room_type scrapeDate2 scrapeDate1 s1 s2
# 52 madrid 30924    Private room  2015-09-04  2015-07-17  1  1
# 53 madrid 30924    Private room  2015-10-02  2015-07-17  1  1
# 54 madrid 30924    Private room  2015-10-02  2015-09-04  1  1
# 55 madrid 30959 Entire home/apt  2015-09-04  2015-07-17  1 NA
# 56 madrid 30959 Entire home/apt  2015-10-02  2015-07-17  1 NA
# 57 madrid 30959 Entire home/apt  2015-10-02  2015-09-04 NA NA

# before we can do the calculations, we need to fix
# an issue in the fields s1 and s2.
# all the current values of these fields are
# 1 (the listing is present and active) or
# NA (the listing is not in the scrape).
# however, if s1 is 1 (present on the first date),
# then an s2 of NA really should
# be marked as 0 (no longer active)
roomDateCombosInt3 <- mutate(
  roomDateCombosInt2,
  s2=ifelse(
    s1==1 & is.na(s2),
    0,
    s2
  )
)
# the result should look something like this:
# > roomDateCombosInt3[52:57,]
#      city    id       room_type scrapeDate2 scrapeDate1 s1 s2
# 52 madrid 30924    Private room  2015-09-04  2015-07-17  1  1
# 53 madrid 30924    Private room  2015-10-02  2015-07-17  1  1
# 54 madrid 30924    Private room  2015-10-02  2015-09-04  1  1
# 55 madrid 30959 Entire home/apt  2015-09-04  2015-07-17  1  0
# 56 madrid 30959 Entire home/apt  2015-10-02  2015-07-17  1  0
# 57 madrid 30959 Entire home/apt  2015-10-02  2015-09-04 NA NA
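# (a note on the cases left alone by that fix: s1=NA with s2=1
# means the room first appeared on the later scrape date, and
# s1=NA with s2=NA means it was absent on both dates. Neither
# case tells us anything about decay from the first date, and
# both are excluded anyway by the s1==1 filter in the
# persistence calculation below.)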
## calculating persistence (longevity) of listings
## using those scrape-date pairs

# now, for each city, room_type, and scrape-date pair,
# I limit the observations to those where s1=1,
# and sum s1 and s2.
# this allows me to calculate the exponential
# "decay" in the availability of room types
persistenceSummaryByCityRoomTypeDatePair <- summarise(
  group_by(
    roomDateCombosInt3[which(roomDateCombosInt3$s1==1),],
    city,
    room_type,
    scrapeDate1,
    scrapeDate2
  ),
  startnum=sum(s1), # no. rooms available on scrapeDate1
  endnum=sum(s2),   # no. of those available on scrapeDate2
  # now, using the standard exponential decay model
  # on this data:
  # y=a(1-r)^t
  # where
  # y=amount after time t
  # a=initial amount
  # r=rate of decay
  # t=time units
  # below I will use these variable names.
  # a, the initial amount, is by definition 1 (I am using
  # proportions, not absolute numbers of units).
  # y is rawPersist, the proportion of rooms available on
  # scrapeDate1 that were also available on scrapeDate2
  rawPersist=endnum/startnum,
  # t is intDays, the interval of days between the dates
  intDays=as.integer(mean(scrapeDate2)-mean(scrapeDate1)),
  # r is decayDay, the rate of decay per day
  decayDay = 1-exp((log(rawPersist))/intDays),
  # persistCheck should be the same as rawPersist; it
  # is just a math check
  persistCheck =(1-decayDay)**intDays,
  # persistYr is the proportion remaining after a year
  persistYr =(1-decayDay)**365
)

# this data frame,
# persistenceSummaryByCityRoomTypeDatePair,
# contains the basic results about unit
# persistence. persistYr is the projected proportion of
# units remaining after a year.
# it should look something like this
# > head(as.data.frame(persistenceSummaryByCityRoomTypeDatePair),3)
#     city       room_type scrapeDate1 scrapeDate2 startnum endnum rawPersist
# 1 madrid Entire home/apt  2015-07-17  2015-09-04     4518   4073  0.9015051
# 2 madrid Entire home/apt  2015-07-17  2015-10-02     4518   3756  0.8313413
# 3 madrid Entire home/apt  2015-09-04  2015-10-02     4634   4219  0.9104445
#   intDays    decayDay persistCheck persistYr
# 1      49 0.002113877    0.9015051 0.4619117
# 2      77 0.002396019    0.8313413 0.4166135
# 3      28 0.003345189    0.9104445 0.2943339

# note: for each city and room type, there are as many
# estimates of persistYr as there are scrape intervals.
# the estimates should probably be averaged or otherwise
# summarized for better reliability.
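# (a quick arithmetic check of the model, using the first
# madrid row of the sample output above: 4518 'entire home'
# listings on 2015-07-17, of which 4073 were still listed
# 49 days later. The Demo names are just for this check.)
rawPersistDemo <- 4073/4518                    # ~0.9015
decayDayDemo <- 1-exp(log(rawPersistDemo)/49)  # ~0.002114 per day
(1-decayDayDemo)**49                           # recovers ~0.9015 (the math check)
(1-decayDayDemo)**365                          # ~0.4619 projected over a year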
## creating datasets for the charts.

# in one chart I want to show how the decay rate
# we just calculated plays out over the months of
# a year (each month=365/12 days).
# so for each row of the results I expand out to a 12-month
# span...
for (i in 1:nrow(persistenceSummaryByCityRoomTypeDatePair)) {
  for (j in 0:12) {
    city <- persistenceSummaryByCityRoomTypeDatePair[i,]$city
    room_type <- persistenceSummaryByCityRoomTypeDatePair[i,]$room_type
    scrapeDate1 <- persistenceSummaryByCityRoomTypeDatePair[i,]$scrapeDate1
    scrapeDate2 <- persistenceSummaryByCityRoomTypeDatePair[i,]$scrapeDate2
    intDays <- persistenceSummaryByCityRoomTypeDatePair[i,]$intDays
    decayDay <- persistenceSummaryByCityRoomTypeDatePair[i,]$decayDay
    months <- j
    pctRem <- round(100*((1-decayDay)**(j*365/12)),1)
    tempDF <- data.frame(
      city,
      room_type,
      scrapeDate1,
      scrapeDate2,
      intDays,
      decayDay,
      months,
      pctRem
    )
    if (i==1 & j==0) {
      persistenceGraphData <- tempDF[0,]
    } #close if statement
    persistenceGraphData <- rbind(
      persistenceGraphData,
      tempDF
    )
  } #closes loop j
} #closes loop i

# here's what persistenceGraphData should look like..
# > head(persistenceGraphData)
#     city       room_type scrapeDate1 scrapeDate2 intDays    decayDay months
# 1 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      0
# 2 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      1
# 3 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      2
# 4 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      3
# 5 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      4
# 6 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      5
#   pctRem
# 1  100.0
# 2   93.8
# 3   87.9
# 4   82.4
# 5   77.3
# 6   72.5

# I also want to make a graph of the growth in Airbnb numbers.
# for each city, room_type, and scrape date, I want to show
# the number of units. However, I want them to be colored by
# their first date of observation.

# creating the data for that graph
theGrowthGraphScrapes <- merge(
  theScrapeData,
  theScrapeRooms,
  by=c("city","id","room_type"),
  all.y=TRUE
)
theGrowthGraphFirstScrapes <- summarise(
  group_by(theGrowthGraphScrapes, city, id, room_type),
  firstScrape=min(scrapeDate)
)
theGrowthGraphDetailData <- merge(
  theGrowthGraphScrapes,
  theGrowthGraphFirstScrapes,
  by=c("city","id","room_type"),
  all.x=TRUE
)
theGrowthGraphSummaryData <- summarise(
  group_by(
    theGrowthGraphDetailData,
    city,
    room_type,
    firstScrape,
    scrapeDate
  ),
  noUnits = n()
) %>%
  arrange(
    city,
    room_type,
    desc(firstScrape),
    scrapeDate
  )

# here's what theGrowthGraphSummaryData should look like...
# > head(theGrowthGraphSummaryData)
#     city       room_type firstScrape scrapeDate noUnits
# 1 madrid Entire home/apt  2015-07-17 2015-07-17    4518
# 2 madrid Entire home/apt  2015-07-17 2015-09-04    4073
# 3 madrid Entire home/apt  2015-07-17 2015-10-02    3756
# 4 madrid Entire home/apt  2015-09-04 2015-09-04     561
# 5 madrid Entire home/apt  2015-09-04 2015-10-02     490
# 6 madrid Entire home/apt  2015-10-02 2015-10-02     324

## defining and printing the charts

# for each city, drawing two graphs:
# one about growth by "cohort" and the next about
# decay in the number of listings
for (n in currentCities) {
  # first drawing the growth bar graph,
  # with bars colored by initial dates
  png(
    filename=paste(
      "charts",
      paste(n,"-growth.png",sep=""),
      sep="/"
    ),
    width=555,
    height=400
  ) #defines a graphic device to print into
  print(
    ggplot() +
      geom_bar(
        data=theGrowthGraphSummaryData[
          which(theGrowthGraphSummaryData$city==n),
        ],
        aes(
          x=scrapeDate,
          y=noUnits,
          colour=factor(firstScrape),
          fill=factor(firstScrape)
        ),
        stat="identity",
        position="stack",
        alpha=0.7
      ) +
      geom_text(
        data=data.frame(
          room_type=c("Shared room","Shared room"),
          thingToPrint=c(
            "data source:\ninsideairbnb.com",
            "analysis:\nmartinjohnbrown.net"),
          xpz=as.Date(c("2015-12-31","2015-12-31")),
          ypz=c(
            max(
              theGrowthGraphSummaryData[
                which(
                  theGrowthGraphSummaryData$city==n
                ),
              ]$noUnits
            )/2,
            max(
              theGrowthGraphSummaryData[
                which(
                  theGrowthGraphSummaryData$city==n
                ),
              ]$noUnits
            )/4
          )
        ),
        aes(label=thingToPrint,x=xpz,y=ypz),
        fontface="italic",
        hjust="right",
        lineheight=0.8,
        nudge_x=0.5,
        colour="gray40"
      ) +
      facet_grid(facets = .~room_type) +
      theme(
        legend.justification=c(1,1),
        legend.position=c(0.98,0.98),
        legend.key.size=unit(1.3,"line"),
        axis.title.x=element_text(
          face="italic",
          size=14,
          color="gray50"),
        axis.title.y=element_text(
          face="italic",
          size=14,
          color="gray50"),
        axis.text.x=element_text(
          angle=-90,
          hjust=1,
          vjust = 0.2,
          face="bold",
          size=10,
          color="gray50"),
        axis.text.y=element_text(
          face="bold",
          size=10,
          color="gray50"),
        plot.title=element_text(size=16,face="bold")
      ) +
      scale_x_date(name="Scrape Date", date_labels="%m/%y") +
      scale_y_continuous(name="Number of airbnb listings") +
      scale_colour_discrete(name="Date first observed") +
      scale_fill_discrete(name="Date first observed") +
      ggtitle(
        paste(n,
          ":\ngrowth and persistence of Airbnb listings",
          sep=""
        )
      ) #closes ggtitle
    # end of ggplot command sequence for this chart
  ) # closes print statement for this chart
  dev.off() # closes print device for this chart

  # now, drawing longevity decay curves
  png(
    filename=paste(
      "charts",
      paste(n,"-persistence.png",sep=""),
      sep="/"
    ),
    width=555,
    height=400
  ) #defines a graphic device to print into
  print(
    ggplot() +
      geom_point(
        data=persistenceGraphData[
          which(persistenceGraphData$city==n),
        ],
        aes(
          x=jitter(months),
          y=jitter(pctRem),
          colour=room_type,
          shape=room_type
        ),
        size=2,
        stroke=1.2,
        alpha=0.7
      ) +
      geom_line(
        data=summarise(
          group_by(
            persistenceGraphData[
              which(persistenceGraphData$city==n),
            ],
            months,
            room_type
          ),
          pctRem=mean(pctRem)
        ),
        aes(x=months,y=pctRem,colour=room_type),
        size=2,
        alpha=0.7
      ) +
      geom_text(
        data=data.frame(
          thingToPrint=c(
            "data source:\ninsideairbnb.com",
            "analysis:\nmartinjohnbrown.net"),
          xpz=c(-1,-1),
          ypz=c(15,4)
        ),
        aes(label=thingToPrint,x=xpz,y=ypz),
        fontface="italic",
        hjust="left",
        lineheight=0.8,
        nudge_x=0,
        colour="gray40"
      ) +
      scale_x_continuous(
        name="Months from first appearance",
        breaks=c(0,3,6,9,12)
      ) +
      scale_y_continuous(
        name="Percentage of listings still active",
        breaks=seq(0,100,20)
      ) +
      expand_limits(y=c(0,100)) +
      scale_shape_manual(values=c(1,3,4)) +
      theme(
        legend.justification=c(1,1),
        legend.position=c(0.98,0.98),
        legend.key.size=unit(1.3,"line"),
        plot.title=element_text(size=16,face="bold"),
        axis.title.x=element_text(
          face="italic",
          size=14,
          color="gray50"
        ),
        axis.title.y=element_text(
          face="italic",
          size=14,
          color="gray50"),
        axis.text.y=element_text(
          face="bold",
          size=10,
          color="gray50"),
        axis.text.x=element_text(
          face="bold",
          size=10,
          color="gray50")
      ) +
      ggtitle(
        paste(
          n,
          ":\nmodeled longevity of airbnb listings",
          sep=""
        )
      ) # closes ggtitle and ggplot command for this graph
  ) # closes print statement for this graph
  dev.off() # closes graphic device for this graph
} #closes for statement n, related to city

# calculating some summary statistics
# which I'll need for graphs comparing cities
persistenceMeanByCityRoomType <- summarise(
  group_by(
    persistenceSummaryByCityRoomTypeDatePair,
    city,
    room_type
  ),
  meanPersistYr = mean(persistYr)
)

# entering some city populations from wikipedia
cityPops <- data.frame(
  city=c(
    "amsterdam",
    "london",
    "los-angeles",
    "madrid",
    "melbourne",
    "new-orleans",
    "portland",
    "san-francisco",
    "sydney"
  ),
  urbanPop= c( #urban area population from wikipedia
    1335115,
    9787426,
    12150996,
    6183000,
    3707530,
    389617,
    1849898,
    864816,
    3908642
  ),
  maPop= c( #metropolitan area population from wikipedia
    2431000,
    13879757,
    13131431,
    6489162,
    4529496,
    1262888,
    2389228,
    4656132,
    4920970
  )
)

# adding up the rooms in each city and including population
airBnbUnitCounts <- merge(
  summarize(
    group_by(
      theGrowthGraphSummaryData,
      city,
      room_type,
      scrapeDate
    ),
    noUnits=sum(noUnits)
  ),
  cityPops,
  by="city"
)
mostRecentScrape <- summarise(
  group_by(
    airBnbUnitCounts,
    city,
    room_type
  ),
  latestScrape =max(scrapeDate)
)
airBnbUnitCountsLatest <- merge(
  airBnbUnitCounts,
  mostRecentScrape,
  by.x=c("city","room_type","scrapeDate"),
  by.y=c("city","room_type","latestScrape"),
  all.y=TRUE
)
airBnbUnitCountsLatestWithRatios <- mutate(
  airBnbUnitCountsLatest,
  unitsPerUrbanPop=noUnits/urbanPop,
  unitsPerMAPop=noUnits/maPop,
  urbanPopPerUnit=round(urbanPop/noUnits),
  MAPopPerUnit=round(maPop/noUnits)
)
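# (a quick sanity check of those ratios on made-up numbers:
# a hypothetical city with 8000 listings and an urban
# population of 1,000,000 works out as follows.)
(8000/1000000)*1000  # unitsPerUrbanPop scaled per 1000 residents -> 8
round(1000000/8000)  # urbanPopPerUnit -> 125 residents per listing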
# I want to make a bar chart that shows the number
# of "entire home" listings per city on the most
# recent scrape. I would also like to note the scrape
# date on the graph somehow.
png(
  filename=paste(
    "charts",
    paste("recent-unit-totals-in-9-cities.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot(
  data=filter(
    airBnbUnitCountsLatestWithRatios,
    room_type=="Entire home/apt"
  )
) +
  geom_bar(
    aes(x=city,y=noUnits,colour=city,fill=city),
    stat="identity",
    alpha=0.7
  ) +
  geom_text(
    aes(label=city,x=city,y=(noUnits+200)),
    hjust=0.5,
    vjust=0,
    color="black"
  ) +
  geom_text(
    aes(
      label=as.character(scrapeDate,"%b\n%Y"),
      x=city,
      y=(noUnits/2)
    ),
    angle=90,
    color="gray50",
    hjust="center",
    fontface="italic"
  ) +
  geom_text(
    data=data.frame(
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=c("sydney","sydney"),
      ypz=c(17500,16000)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    nudge_x=0.5,
    lineheight=0.8,
    colour="gray40"
  ) +
  scale_y_continuous(
    name="Active 'entire home' listings"
  ) +
  theme(
    axis.text.x=element_blank(),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_blank(),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.position="none",
    plot.title=element_text(size=16,face="bold")
  ) +
  ggtitle("Quantity of 'entire home' Airbnb listings in 9 global cities")
dev.off() # turns off graphic device

# I want to make a bar chart that shows the number
# of "entire home" listings per city on the most
# recent scrape PER 1000 URBAN POPULATION.
# I would also like to note the scrape
# date on the graph somehow.
png(
  filename=paste(
    "charts",
    paste("units-per-1000-urban-pop.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot(
  data=filter(
    airBnbUnitCountsLatestWithRatios,
    room_type=="Entire home/apt"
  )
) +
  geom_bar(
    aes(
      x=city,
      y=(unitsPerUrbanPop*1000),
      colour=city,
      fill=city
    ),
    stat="identity",
    alpha=0.7
  ) +
  geom_text(
    aes(label=city,x=city,y=(unitsPerUrbanPop*1000+0.1)),
    hjust=0.5,
    vjust=0,
    color="black"
  ) +
  geom_text(
    aes(
      label=as.character(scrapeDate,"%b\n%Y"),
      x=city,
      y=(unitsPerUrbanPop*1000/2)
    ),
    angle=90,
    color="gray50",
    hjust="center",
    fontface="italic"
  ) +
  geom_text(
    data=data.frame(
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=c("sydney","sydney"),
      ypz=c(7.0,6.3)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    lineheight=0.8,
    nudge_x=0.5,
    colour="gray40"
  ) +
  scale_y_continuous(
    name="Number of 'entire home' AirBnb listings\nper 1000 urban population"
  ) +
  theme(
    axis.text.x=element_blank(),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_blank(),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.position="none",
    plot.title=element_text(size=16,face="bold")
  ) +
  ggtitle("'Entire home' AirBnb listings per urban population")
dev.off() # turns off graphic device

# I want to make a bar chart that shows the number
# of "entire home" listings per city on the most
# recent scrape PER 1000 METROPOLITAN POPULATION.
# I would also like to note the scrape
# date on the graph somehow.
png(
  filename=paste(
    "charts",
    paste("units-per-1000-metro-pop.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot(
  data=filter(
    airBnbUnitCountsLatestWithRatios,
    room_type=="Entire home/apt"
  )
) +
  geom_bar(
    aes(
      x=city,
      y=(unitsPerMAPop*1000),
      colour=city,
      fill=city
    ),
    stat="identity",
    alpha=0.7
  ) +
  geom_text(
    aes(label=city,x=city,y=(unitsPerMAPop*1000+0.05)),
    hjust=0.5,
    vjust=0,
    color="black"
  ) +
  geom_text(
    aes(
      label=as.character(scrapeDate,"%b\n%Y"),
      x=city,
      y=(unitsPerMAPop*1000/2)
    ),
    angle=90,
    color="gray50",
    hjust="center",
    fontface="italic"
  ) +
  geom_text(
    data=data.frame(
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=c("sydney","sydney"),
      ypz=c(3.7,3.4)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    lineheight=0.8,
    nudge_x=0.5,
    colour="gray40"
  ) +
  scale_y_continuous(
    name="Number of 'entire home' AirBnb listings\nper 1000 metropolitan area population"
  ) +
  theme(
    axis.text.x=element_blank(),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_blank(),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.position="none",
    plot.title=element_text(size=16,face="bold")
  ) +
  ggtitle("'Entire home' AirBnb listings per metro population")
dev.off() # turns off graphic device

# I want to make a bar chart that shows the modeled
# persistence of "entire home" listings per city.
png(
  filename=paste(
    "charts",
    paste("modelled-persistence-in-9-cities.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot() +
  geom_jitter(
    data=filter(
      persistenceSummaryByCityRoomTypeDatePair,
      room_type=="Entire home/apt"
    ),
    aes(
      x=city,
      y=persistYr*100,
      colour=city,
      fill=city
    )
  ) +
  geom_bar(
    data=filter(
      persistenceMeanByCityRoomType,
      room_type=="Entire home/apt"
    ),
    aes(
      x=city,
      y=meanPersistYr*100,
      colour=city,
      fill=city
    ),
    stat="identity",
    alpha=0.4
  ) +
  geom_text(
    data=filter(
      persistenceMeanByCityRoomType,
      room_type=="Entire home/apt"
    ),
    aes(label=city,x=city,y=(meanPersistYr*100+1)),
    hjust=0.5,
    vjust=0,
    color="black"
  ) +
  geom_text(
    data=data.frame(
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=c("sydney","sydney"),
      ypz=c(97,90)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    lineheight=0.8,
    nudge_x=0.5,
    colour="gray40"
  ) +
  scale_y_continuous(
    name="Modeled percentage of 'entire home' AirBnb listings \nstill active after 1 year"
  ) +
  expand_limits(y=c(0,100)) +
  theme(
    axis.text.x=element_blank(),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_blank(),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.position="none",
    plot.title=element_text(size=16,face="bold")
  ) +
  ggtitle("Modeling based on repeated 'scrapes' suggests\nmany AirBnb listings are active less than 1 year")
dev.off() # turns off graphic device

# I want to make a chart that shows growth in
# Airbnb in all cities on a common scale --
# the number of units observed on the first
# scrape.
# I don't know why the chain operator isn't working for
# me right now, so I am going to do the following really
# inefficiently (a chained sketch appears at the end of
# this script).
tempDF <- as.data.frame(
  filter(
    theGrowthGraphSummaryData,
    room_type=="Entire home/apt"
  )
)
tempDF <- arrange(
  tempDF,
  city,
  room_type,
  scrapeDate,
  firstScrape
)
tempDF <- summarise(
  group_by(
    tempDF,
    city,
    room_type,
    scrapeDate
  ),
  totUnits = sum(noUnits)
)
tempDF2 <- summarise(
  group_by(tempDF, city, room_type),
  firstDate=first(scrapeDate),
  firstCount=first(totUnits)
)
entireHomesGrowthSummary <- merge(
  as.data.frame(
    filter(
      theGrowthGraphSummaryData,
      room_type=="Entire home/apt"
    )
  ),
  as.data.frame(tempDF2),
  by.x=c("city","room_type"),
  by.y=c("city","room_type"),
  all.x=TRUE
) %>%
  mutate(
    pctOfFirst=noUnits/firstCount
  )
entireHomesGrowthSummaryPlus <- rbind(
  data.frame(
    entireHomesGrowthSummary,
    thingy="growth in total number of 'entire home' listings"
  ),
  data.frame(
    filter(
      entireHomesGrowthSummary,
      firstScrape==firstDate
    ),
    thingy="decline among listings active on first scrape"
  )
)
entireHomesGrowthSummaryPlusSummed <- summarise(
  group_by(
    entireHomesGrowthSummaryPlus,
    thingy,
    city,
    room_type,
    scrapeDate
  ),
  totUnits=sum(noUnits),
  firstUnits=mean(firstCount)
) %>%
  mutate(
    pctUnits=100*totUnits/firstUnits
  )

# open a graphic device
png(
  filename=paste(
    "charts",
    paste("growth-in-9-cities.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot() +
  geom_point(
    data=entireHomesGrowthSummaryPlusSummed,
    aes(
      x=scrapeDate,
      y=pctUnits,
      colour=city,
      fill=city
    ),
    size=3,
    alpha=0.6
  ) +
  geom_line(
    data=entireHomesGrowthSummaryPlusSummed,
    aes(
      x=scrapeDate,
      y=pctUnits,
      colour=city,
      fill=city
    ),
    size=1,
    alpha=0.6
  ) +
  geom_text(
    data=data.frame(
      thingy=c(
        "decline among listings active on first scrape",
        "decline among listings active on first scrape"
      ),
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=as.Date(c("2016-04-30","2016-04-30")),
      ypz=c(157,145)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    lineheight=0.8,
    nudge_x=0.5,
    color="gray40"
  ) +
  facet_grid(facets=.~thingy) +
  ggtitle("Growth and decline in AirBnb listings in nine global cities") +
  guides(
    fill=guide_legend(ncol=2),
    color=guide_legend(ncol=2)
  ) +
  scale_x_date(
    name="Scrape Date",
    date_labels="%m/%y"
  ) +
  scale_y_continuous(
    name="Number of 'entire home' Airbnb listings\n(as percentage of initial count)"
  ) +
  theme(
    axis.text.x=element_text(
      # angle=90,
      # hjust=0,
      # vjust = 0.2,
      face="bold",
      size=10,
      color="gray50"),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_text(
      face="italic",
      size=14,
      color="gray50"),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.justification=c(0,0),
    legend.key.size=unit(0.5,"line"),
    legend.position=c(0.09,0.03),
    legend.title.align=0.5,
    plot.title=element_text(size=16,face="bold"),
    strip.text.x=element_text(size=10,face="bold.italic")
  )
# end of ggplot command sequence for this chart
dev.off() # closes graphic device
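# (the chained sketch promised above: with a working %>%, the
# three tempDF steps could likely be collapsed into a single
# pipeline. This is only a sketch under that assumption;
# "tempDFChained" is an invented name, and it has not been
# tested against the package versions used in this script.)
tempDFChained <- theGrowthGraphSummaryData %>%
  filter(room_type=="Entire home/apt") %>%
  as.data.frame() %>%
  arrange(city, room_type, scrapeDate, firstScrape) %>%
  group_by(city, room_type, scrapeDate) %>%
  summarise(totUnits = sum(noUnits))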