# An R script for examining Airbnb listing quantity and
# longevity
# by Martin John Brown
# http://martinjohnbrown.net
# released under the Creative Commons
# "Attribution-NonCommercial-ShareAlike 4.0 International"
# license: http://creativecommons.org/licenses/by-nc-sa/4.0/

# note: this script ran successfully for me under
# R version 3.2.3 (2015-12-10), via RStudio 0.99.491, in
# late May 2016. Your mileage may vary.

# In this R script I collect Airbnb listing data
# for various cities and examine it with two questions in mind:
# a) how is the total number of listings changing over time?
# b) how long-lived are individual listings?

# Professional coders should note I've never had any training
# in coding, so it's probably messy by some people's standards.

# METHOD AND VOCABULARY

# The web site http://insideairbnb.com contains "scrapes"
# (or copies) of all live Airbnb listing data in various cities
# on various dates. Note that Airbnb calls each listing
# a "room," but there are three "room types": shared room,
# private room, and entire home.

# When there is more than one scrape per city, and I know the
# date of each scrape, I can calculate the total number of
# rooms of each type available on each scrape.
# Graphing this over the multiple scrape dates will show how
# the total number of listings per city is growing
# or shrinking.

# Furthermore, checking the appearance and disappearance of
# individual "rooms" across multiple scrapes will allow me to
# estimate something about the longevity of Airbnb listings --
# in particular, the proportion of Airbnb listings that are
# likely to persist over defined amounts of time. Listings
# that do not persist as Airbnb offerings may have reverted
# to some other use.

# This script has a number of sections:
## -loading R packages
## -creating a database of available scrape files
## -choosing the cities to study
## -downloading all those files and compiling them into
##  a single R data frame
## -listing all the pairs of scrape dates for each city
## -merging the scrape-date pairs with the actual data
## -calculating longevity from the scrape-date pairs
## -creating datasets to use in charts
## -defining and printing the charts

## loading some R packages:
library("dplyr")
library("stringr")
library("data.table")
library("ggplot2")

## creating a list of available scrape files

# downloading the data page HTML from insideAirbnb
availableDataFilesURL <- "http://insideairbnb.com/get-the-data.html"
# splitting the filename part out of that URL
availableDataFilesFileSpec <-
  str_split(availableDataFilesURL, "/")[[1]][4]
# downloading the file
download.file(
  availableDataFilesURL,
  availableDataFilesFileSpec
)
# turning it into a single string so I can extract
# the data URLs
studyObject <- readChar(
  availableDataFilesFileSpec,
  file.info(availableDataFilesFileSpec)$size
)
# extracting all the data URLs.
# sorry, I am not great with regular expressions,
# so I have to do this in several steps
dataURLlist <- str_extract_all(
  studyObject,
  '"http://data.*listings.csv.gz"'
)[[1]]
dataURLlist <- str_sub(
  dataURLlist,
  start = 2,
  end = str_length(dataURLlist) - 1
)
# dissecting those URLs and getting info out of them
dataURLpieces <- str_split(dataURLlist, "/")
for (i in 1:length(dataURLpieces)) {
  fileSpec <- dataURLpieces[[i]][9]
  scrapeDate <- as.Date(dataURLpieces[[i]][7])
  city <- dataURLpieces[[i]][6]
  region <- dataURLpieces[[i]][5]
  country <- dataURLpieces[[i]][4]
  URL <- dataURLlist[i]
  tempDF <- data.frame(
    fileSpec, city, scrapeDate, region, country, URL,
    stringsAsFactors = FALSE
  )
  if (i == 1) {
    dataURLs <- tempDF[0, ]
  } # closes the if statement
  dataURLs <- rbind(dataURLs, tempDF)
} # closes the for loop

# now I have "dataURLs", a data frame
# containing all the cities and scrapes and the
# URLs at which the scrapes can be obtained.
# here is a sample of what the dataURLs table
# should look like...
# > head(dataURLs,2)
#          fileSpec      city scrapeDate        region         country
# 1 listings.csv.gz amsterdam 2016-01-03 north-holland the-netherlands
# 2 listings.csv.gz amsterdam 2015-09-03 north-holland the-netherlands
#   URL
# 1 http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2016-01-03/data/listings.csv.gz
# 2 http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2015-09-03/data/listings.csv.gz

## choosing the cities I want to study

# it is helpful to study only a few cities at a time,
# to prevent processing errors. Depending on the
# host machine, R may run out of memory to work with.
# Moreover, the csv data files provided by insideAirbnb do
# not always have the same structure, and there are too many
# of them to write a custom import routine for each city
# and scrape. In lieu of that I have used the R function
# fread(), which uses column names instead of locations
# and can therefore adapt to some variations in the source
# data files. However, a few cities still fail.

# first, listing the number of scrapes by city
table(dataURLs$city)

# the output from table() should look
# something like this...
#     amsterdam           antwerp         asheville            athens
#             4                 1                 1                 1
#        austin         barcelona            berlin            boston
#             2                 5                 1                 1
#      brussels           chicago            dublin            london
#             1                 1                 1                 3
#   los-angeles            madrid          mallorca         melbourne
#             6                 3                 1                 5
#      montreal         nashville       new-orleans     new-york-city
#             2                 2                 3                15
#       oakland             paris          portland         san-diego
#             2                 3                 4                 1
# san-francisco santa-cruz-county           seattle            sydney
#             3                 1                 2                 5
#       toronto          trentino         vancouver            venice
#             2                 1                 2                 1
#        vienna     washington-dc
#             1                 1

# we are only concerned with cities
# that have >=3 scrapes.
# let's pick a few to work with.
cityList <- which(
  dataURLs$city=="amsterdam" #works
  |
  # dataURLs$city=="austin" #works
  # |
  # dataURLs$city=="barcelona" #fread error
  # |
  dataURLs$city=="london" #works
  |
  dataURLs$city=="los-angeles" # works
  |
  dataURLs$city=="madrid" # works
  |
  dataURLs$city=="melbourne" # works
  |
  # dataURLs$city=="nashville" # works
  # |
  dataURLs$city=="new-orleans" #works
  |
  # dataURLs$city=="new-york-city" # download/import error
  # |
  # dataURLs$city=="paris" # fread error
  # |
  dataURLs$city=="portland" #works
  |
  dataURLs$city=="san-francisco" #works
  |
  # dataURLs$city=="seattle" #works
  # |
  dataURLs$city=="sydney" #works
  # |
  # dataURLs$city=="toronto" #works
  # |
  # dataURLs$city=="vancouver"
)

## downloading all those scrape files, and putting
## them into a single R data frame.

# I use a loop to go through all the scrapes.
# warning: this may take a long time
# and cause errors;
# you will need to check the results
for (j in cityList) {
  # these details will identify the scrape
  sampleURL <- dataURLs[j,]$URL
  sampleURLfileSpec <- dataURLs[j,]$fileSpec
  city <- dataURLs[j,]$city
  scrapeDate <- dataURLs[j,]$scrapeDate
  # downloading the scrape file
  download.file(sampleURL, destfile=sampleURLfileSpec)
  # unzipping it
  R.utils::gunzip(sampleURLfileSpec, remove=TRUE, overwrite=TRUE)
  # I am assuming the unzipped file is called "listings.csv".
  # now reading in the unzipped file with data.table,
  # getting only two columns, "id" and "room_type"
  sampleData <- fread(
    "listings.csv",
    select = c("id","room_type")
  )
  tempScrapeFile <- data.table(sampleData, city, scrapeDate)
  if (j==cityList[1]) {
    theScrapeData <- tempScrapeFile[0,]
  } # closes that short if section
  theScrapeData <- rbind(theScrapeData, tempScrapeFile)
} # closes for loop j

# if that loop produced fread() errors in the form
# "Read less rows (100) than were allocated (200)", the
# results are probably OK. Other fread() errors should
# be investigated.
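# (an alternative sketch, in case one bad scrape aborts the
# whole loop: wrap the risky steps in tryCatch() so a failure
# is reported and skipped instead of stopping everything.
# downloadOneScrape() is an invented helper, not used elsewhere
# in this script; it just repeats the same download/gunzip/fread
# steps as the loop above and returns NULL on failure.)
downloadOneScrape <- function(j) {
  tryCatch({
    download.file(dataURLs[j,]$URL, destfile=dataURLs[j,]$fileSpec)
    R.utils::gunzip(dataURLs[j,]$fileSpec, remove=TRUE, overwrite=TRUE)
    sampleData <- fread("listings.csv", select=c("id","room_type"))
    data.table(sampleData,
               city=dataURLs[j,]$city,
               scrapeDate=dataURLs[j,]$scrapeDate)
  }, error = function(e) {
    # report the failure and let the caller skip this scrape
    message("scrape failed: ", dataURLs[j,]$city, " ",
            dataURLs[j,]$scrapeDate, " (", conditionMessage(e), ")")
    NULL
  })
}
# usage would look something like:
# scrapes <- lapply(cityList, downloadOneScrape)
# theScrapeData <- rbindlist(scrapes[!sapply(scrapes, is.null)])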
# theScrapeData has the raw data for multiple cities.
# these are the 'positives,' the listings which were on
# offer on the scrape dates recorded. It should look
# something like this...
# > theScrapeData
#             id       room_type      city scrapeDate
#     1: 7328003 Entire home/apt    madrid 2015-10-02
#     2: 6289024    Private room    madrid 2015-10-02
#     3: 3778195     Shared room    madrid 2015-10-02
#     4: 7056183    Private room    madrid 2015-10-02
#     5: 6847125    Private room    madrid 2015-10-02
#    ---
# 57362: 2224699    Private room melbourne 2015-07-18
# 57363:  628156    Private room melbourne 2015-07-18
# 57364: 3585560    Private room melbourne 2015-07-18
# 57365:  229502 Entire home/apt melbourne 2015-07-18
# 57366: 1605946    Private room melbourne 2015-07-18

# looking at theScrapeData to extract some useful
# reference and lookup info:

# a) current cities of interest and their scrape dates
currentCitiesAndScrapes <- unique(
  select(theScrapeData, city, scrapeDate) %>%
    arrange(city, scrapeDate)
)
# that table should look something like this:
# > currentCitiesAndScrapes
#         city scrapeDate
# 1:    madrid 2015-07-17
# 2:    madrid 2015-09-04
# 3:    madrid 2015-10-02
# 4: melbourne 2015-07-18
# 5: melbourne 2015-09-03
# 6: melbourne 2015-10-02
# 7: melbourne 2015-12-03
# 8: melbourne 2016-01-03

# b) current cities of interest only
currentCities <- unique(currentCitiesAndScrapes$city)
# that should look something like this:
# > currentCities
# [1] "madrid"    "melbourne"

# c) a list of the pairs of scrape dates
# available for each city individually.
# this is important because the persistence analysis
# needs to compare individual listings on pairs of
# dates
for (k in currentCities) {
  cityDateList <- currentCitiesAndScrapes[
    which(currentCitiesAndScrapes$city==k),
  ]$scrapeDate
  cityDateCombos <- as.data.frame(
    t(combn(cityDateList, 2))
  )
  cityDateCombos <- rename(
    cityDateCombos,
    scrapeDate1=V1,
    scrapeDate2=V2
  )
  cityDateCombos$scrapeDate1 <-
    as.Date(cityDateCombos$scrapeDate1, origin="1970-01-01")
  cityDateCombos$scrapeDate2 <-
    as.Date(cityDateCombos$scrapeDate2, origin="1970-01-01")
  cityDateCombos <- data.frame(city=k, cityDateCombos)
  if (k==currentCities[1]) {
    cityDateCombosAll <- cityDateCombos[0,]
  } # ends if clause
  cityDateCombosAll <- rbind(cityDateCombosAll, cityDateCombos)
} # ends for loop k

# note that cityDateCombosAll has all the conceivable pairs
# of dates for each city. It looks kinda like this...
# > cityDateCombosAll
#         city scrapeDate1 scrapeDate2
# 1     madrid  2015-07-17  2015-09-04
# 2     madrid  2015-07-17  2015-10-02
# 3     madrid  2015-09-04  2015-10-02
# 4  melbourne  2015-07-18  2015-09-03
# 5  melbourne  2015-07-18  2015-10-02
# 6  melbourne  2015-07-18  2015-12-03
# 7  melbourne  2015-07-18  2016-01-03
# 8  melbourne  2015-09-03  2015-10-02
# 9  melbourne  2015-09-03  2015-12-03
# 10 melbourne  2015-09-03  2016-01-03
# 11 melbourne  2015-10-02  2015-12-03
# 12 melbourne  2015-10-02  2016-01-03
# 13 melbourne  2015-12-03  2016-01-03
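# (a quick illustration of what combn() is doing in the loop
# above, on three made-up dates: combn() returns one column per
# pair, so t() flips it to one row per pair. Note that Dates get
# coerced to plain day counts inside the resulting matrix, which
# is why the loop restores them with
# as.Date(..., origin="1970-01-01").)
demoDates <- as.Date(c("2015-07-17", "2015-09-04", "2015-10-02"))
t(combn(demoDates, 2))
# should print a 3 x 2 matrix of day counts since 1970-01-01,
# roughly like this:
#       [,1]  [,2]
# [1,] 16633 16682
# [2,] 16633 16710
# [3,] 16682 16710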
## merging the scrape-date pairs with the observed
## scrape data.

# before I can start the merge I need to create and
# work with a list of "rooms" (airbnb's name for
# listings) without any dates attached.
theScrapeRooms <- summarise(
  group_by(theScrapeData, city, id, room_type),
  noScrapes=n()
)
theScrapeRooms$noScrapes <- NULL

# theScrapeRooms is the list of rooms. it
# should look something like this...
# > theScrapeRooms
# Source: local data table [20,609 x 3]
# Groups: city, id
#
#      city      id       room_type
#     (chr)   (int)           (chr)
# 1  madrid 7328003 Entire home/apt
# 2  madrid 6289024    Private room
# 3  madrid 3778195     Shared room
# 4  madrid 7056183    Private room
# 5  madrid 6847125    Private room
# 6  madrid 3453225    Private room
# 7  madrid  336869    Private room
# 8  madrid 5876946    Private room
# 9  madrid  316712    Private room
# 10 madrid 6584180    Private room

# there is one quirk in the list of rooms that
# could screw up our later calculations. a small
# number of airbnb hosts have changed the
# "room_type" of their listing over time -- for
# example, from "private room" to "shared room".
# Since we will always want to know how the room
# types differ, these particular "rooms" are not
# informative. We need to eliminate them from the
# study of persistence of individual listings over
# time (though they don't need to be eliminated
# from the study of the total number of listings
# over time). In the following code we will look
# for and eliminate those problematic listings.
# fortunately this seems to affect only about 1% of
# the rooms in most cities' datasets, so it should
# not throw off the results.

# looking to see if any rooms have been listed as >1
# room type
roomTypeCountPerRoom <- summarise(
  group_by(
    theScrapeRooms,
    city,
    id
  ),
  noRoomTypes=n()
)
# here are some typical results
# > table(
#     roomTypeCountPerRoom$city,
#     roomTypeCountPerRoom$noRoomTypes,
#     useNA = "ifany"
#   )
#
#                 1    2    3
#   amsterdam 16115  137    1
#   portland   3853   52    0
#
# the rare rooms with >1 room type will cause problems
# when it comes to calculating persistence. The hosts
# were not offering the same accommodation each time.
# Since they look to be a very small percentage of rooms,
# I will just remove them from the list of rooms
theScrapeRooms <- merge(
  theScrapeRooms,
  roomTypeCountPerRoom,
  by=c("city","id"),
  all.x=TRUE
) %>%
  filter(
    noRoomTypes==1
  ) %>%
  mutate(
    noRoomTypes=NULL
  )

# theScrapeRooms still has the same structure as before,
# just a few fewer rows of data, like this...
# > theScrapeRooms
# Source: local data table [20,338 x 3]
#
#      city    id       room_type
#     (chr) (int)           (chr)
# 1  madrid 18628 Entire home/apt
# 2  madrid 19864 Entire home/apt
# 3  madrid 21512 Entire home/apt
# 4  madrid 21548 Entire home/apt
# 5  madrid 21853    Private room
# 6  madrid 22736 Entire home/apt
# 7  madrid 23021 Entire home/apt
# 8  madrid 24805 Entire home/apt
# 9  madrid 24836 Entire home/apt
# 10 madrid 26571    Private room

# now creating a record for each conceivable pair
# of dates in each room in each city.
for (l in 1:nrow(cityDateCombosAll)) {
  tempCity <- cityDateCombosAll[l,]$city
  scrapeDate1 <- cityDateCombosAll[l,]$scrapeDate1
  scrapeDate2 <- cityDateCombosAll[l,]$scrapeDate2
  tempRooms <- theScrapeRooms[theScrapeRooms$city==tempCity,]
  tempCombo <- data.frame(
    tempRooms,
    scrapeDate1,
    scrapeDate2
  )
  if (l==1) {
    roomDateCombos <- tempCombo[0,]
  } # close if statement
  roomDateCombos <- rbind(roomDateCombos, tempCombo)
} # close for loop l

# the result should look something like this...
# > head(roomDateCombos)
#     city    id       room_type scrapeDate1 scrapeDate2
# 1 madrid 18628 Entire home/apt  2015-07-17  2015-09-04
# 2 madrid 19864 Entire home/apt  2015-07-17  2015-09-04
# 3 madrid 21512 Entire home/apt  2015-07-17  2015-09-04
# 4 madrid 21548 Entire home/apt  2015-07-17  2015-09-04
# 5 madrid 21853    Private room  2015-07-17  2015-09-04
# 6 madrid 22736 Entire home/apt  2015-07-17  2015-09-04

# now, doing the merge between all the possible rooms
# and dates and the ones that were actually observed
roomDateCombosInt1 <- merge(
  roomDateCombos,
  data.frame(theScrapeData, s1=1),
  by.x=c("city","id","room_type","scrapeDate1"),
  by.y=c("city","id","room_type","scrapeDate"),
  all.x=TRUE
)
roomDateCombosInt2 <- merge(
  roomDateCombosInt1,
  data.frame(theScrapeData, s2=1),
  by.x=c("city","id","room_type","scrapeDate2"),
  by.y=c("city","id","room_type","scrapeDate"),
  all.x=TRUE
)

# note roomDateCombosInt2 has all the conceivable combinations
# of room and scrape-date pairs. This is the data set from
# which we can estimate persistence. It should look
# something like this:
# > roomDateCombosInt2[52:57,]
#      city    id       room_type scrapeDate2 scrapeDate1 s1 s2
# 52 madrid 30924    Private room  2015-09-04  2015-07-17  1  1
# 53 madrid 30924    Private room  2015-10-02  2015-07-17  1  1
# 54 madrid 30924    Private room  2015-10-02  2015-09-04  1  1
# 55 madrid 30959 Entire home/apt  2015-09-04  2015-07-17  1 NA
# 56 madrid 30959 Entire home/apt  2015-10-02  2015-07-17  1 NA
# 57 madrid 30959 Entire home/apt  2015-10-02  2015-09-04 NA NA

# before we can do the calculations, we need to fix
# an issue in the fields s1 and s2.
# all the current values of these fields are
# 1 (the listing is present and active) or
# NA (the listing is not in the scrape).
# however, if s1 is 1 (present on the first date),
# then an s2 of NA really should
# be marked as 0 (no longer active)
roomDateCombosInt3 <- mutate(
  roomDateCombosInt2,
  s2=ifelse(
    s1==1 & is.na(s2),
    0,
    s2
  )
)
# the result should look something like this:
# > roomDateCombosInt3[52:57,]
#      city    id       room_type scrapeDate2 scrapeDate1 s1 s2
# 52 madrid 30924    Private room  2015-09-04  2015-07-17  1  1
# 53 madrid 30924    Private room  2015-10-02  2015-07-17  1  1
# 54 madrid 30924    Private room  2015-10-02  2015-09-04  1  1
# 55 madrid 30959 Entire home/apt  2015-09-04  2015-07-17  1  0
# 56 madrid 30959 Entire home/apt  2015-10-02  2015-07-17  1  0
# 57 madrid 30959 Entire home/apt  2015-10-02  2015-09-04 NA NA
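# (a note on the cases left alone by that fix: s1=NA with s2=1
# means the room first appeared on the later scrape date, and
# s1=NA with s2=NA means it was absent on both dates. Neither
# case tells us anything about decay from the first date, and
# both are excluded anyway by the s1==1 filter in the
# persistence calculation below.)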
## calculating persistence (longevity) of listings
## using those scrape-date pairs

# now, for each city, room_type, and scrape-date pair,
# I limit the observations to those where s1=1,
# and sum s1 and s2.
# this allows me to calculate the exponential
# "decay" in the availability of room types
persistenceSummaryByCityRoomTypeDatePair <- summarise(
  group_by(
    roomDateCombosInt3[which(roomDateCombosInt3$s1==1),],
    city,
    room_type,
    scrapeDate1,
    scrapeDate2
  ),
  startnum=sum(s1), # no. rooms available on scrapeDate1
  endnum=sum(s2),   # no. of those available on scrapeDate2
  # now, using the standard exponential decay model
  # on this data:
  # y=a(1-r)^t
  # where
  # y=amount after time t
  # a=initial amount
  # r=rate of decay
  # t=time units
  # below I will use these variable names.
  # a, the initial amount, is by definition 1 (I am using
  # proportions, not absolute numbers of units).
  # y is rawPersist, the proportion of rooms available on
  # scrapeDate1 that were also available on scrapeDate2
  rawPersist=endnum/startnum,
  # t is intDays, the interval of days between the dates
  intDays=as.integer(mean(scrapeDate2)-mean(scrapeDate1)),
  # r is decayDay, the rate of decay per day
  decayDay = 1-exp((log(rawPersist))/intDays),
  # persistCheck should be the same as rawPersist; it
  # is just a math check
  persistCheck =(1-decayDay)**intDays,
  # persistYr is the proportion remaining after a year
  persistYr =(1-decayDay)**365
)

# this data frame,
# persistenceSummaryByCityRoomTypeDatePair,
# contains the basic results about unit
# persistence. persistYr is the projected proportion of
# units remaining after a year.
# it should look something like this
# > head(as.data.frame(persistenceSummaryByCityRoomTypeDatePair),3)
#     city       room_type scrapeDate1 scrapeDate2 startnum endnum rawPersist
# 1 madrid Entire home/apt  2015-07-17  2015-09-04     4518   4073  0.9015051
# 2 madrid Entire home/apt  2015-07-17  2015-10-02     4518   3756  0.8313413
# 3 madrid Entire home/apt  2015-09-04  2015-10-02     4634   4219  0.9104445
#   intDays    decayDay persistCheck persistYr
# 1      49 0.002113877    0.9015051 0.4619117
# 2      77 0.002396019    0.8313413 0.4166135
# 3      28 0.003345189    0.9104445 0.2943339

# note: for each city and room type, there are as many
# estimates of persistYr as there are scrape intervals.
# the estimates should probably be averaged or otherwise
# summarized for better reliability.
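# (a quick arithmetic check of the model, using the first
# madrid row of the sample output above: 4518 'entire home'
# listings on 2015-07-17, of which 4073 were still listed
# 49 days later. The Demo names are just for this check.)
rawPersistDemo <- 4073/4518                    # ~0.9015
decayDayDemo <- 1-exp(log(rawPersistDemo)/49)  # ~0.002114 per day
(1-decayDayDemo)**49                           # recovers ~0.9015 (the math check)
(1-decayDayDemo)**365                          # ~0.4619 projected over a year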
## creating datasets for the charts.

# in one chart I want to show how the decay rate
# we just calculated plays out over the months of
# a year (each month=365/12 days).
# so for each row of the results I expand out to a 12-month
# span...
for (i in 1:nrow(persistenceSummaryByCityRoomTypeDatePair)) {
  for (j in 0:12) {
    city <- persistenceSummaryByCityRoomTypeDatePair[i,]$city
    room_type <- persistenceSummaryByCityRoomTypeDatePair[i,]$room_type
    scrapeDate1 <- persistenceSummaryByCityRoomTypeDatePair[i,]$scrapeDate1
    scrapeDate2 <- persistenceSummaryByCityRoomTypeDatePair[i,]$scrapeDate2
    intDays <- persistenceSummaryByCityRoomTypeDatePair[i,]$intDays
    decayDay <- persistenceSummaryByCityRoomTypeDatePair[i,]$decayDay
    months <- j
    pctRem <- round(100*((1-decayDay)**(j*365/12)),1)
    tempDF <- data.frame(
      city,
      room_type,
      scrapeDate1,
      scrapeDate2,
      intDays,
      decayDay,
      months,
      pctRem
    )
    if (i==1 & j==0) {
      persistenceGraphData <- tempDF[0,]
    } #close if statement
    persistenceGraphData <- rbind(
      persistenceGraphData,
      tempDF
    )
  } #closes loop j
} #closes loop i

# here's what persistenceGraphData should look like..
# > head(persistenceGraphData)
#     city       room_type scrapeDate1 scrapeDate2 intDays    decayDay months
# 1 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      0
# 2 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      1
# 3 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      2
# 4 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      3
# 5 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      4
# 6 madrid Entire home/apt  2015-07-17  2015-09-04      49 0.002113877      5
#   pctRem
# 1  100.0
# 2   93.8
# 3   87.9
# 4   82.4
# 5   77.3
# 6   72.5

# I also want to make a graph of the growth in Airbnb numbers.
# for each city, room_type, and scrape date, I want to show
# the number of units. However, I want them to be colored by
# their first date of observation.

# creating the data for that graph
theGrowthGraphScrapes <- merge(
  theScrapeData,
  theScrapeRooms,
  by=c("city","id","room_type"),
  all.y=TRUE
)
theGrowthGraphFirstScrapes <- summarise(
  group_by(theGrowthGraphScrapes, city, id, room_type),
  firstScrape=min(scrapeDate)
)
theGrowthGraphDetailData <- merge(
  theGrowthGraphScrapes,
  theGrowthGraphFirstScrapes,
  by=c("city","id","room_type"),
  all.x=TRUE
)
theGrowthGraphSummaryData <- summarise(
  group_by(
    theGrowthGraphDetailData,
    city,
    room_type,
    firstScrape,
    scrapeDate
  ),
  noUnits = n()
) %>%
  arrange(
    city,
    room_type,
    desc(firstScrape),
    scrapeDate
  )

# here's what theGrowthGraphSummaryData should look like...
# > head(theGrowthGraphSummaryData)
#     city       room_type firstScrape scrapeDate noUnits
# 1 madrid Entire home/apt  2015-07-17 2015-07-17    4518
# 2 madrid Entire home/apt  2015-07-17 2015-09-04    4073
# 3 madrid Entire home/apt  2015-07-17 2015-10-02    3756
# 4 madrid Entire home/apt  2015-09-04 2015-09-04     561
# 5 madrid Entire home/apt  2015-09-04 2015-10-02     490
# 6 madrid Entire home/apt  2015-10-02 2015-10-02     324

## defining and printing the charts

# for each city, drawing two graphs:
# one about growth by "cohort" and the next about
# decay in the number of listings
for (n in currentCities) {
  # first drawing the growth bar graph,
  # with bars colored by initial dates
  png(
    filename=paste(
      "charts",
      paste(n,"-growth.png",sep=""),
      sep="/"
    ),
    width=555,
    height=400
  ) #defines a graphic device to print into
  print(
    ggplot() +
      geom_bar(
        data=theGrowthGraphSummaryData[
          which(theGrowthGraphSummaryData$city==n),
        ],
        aes(
          x=scrapeDate,
          y=noUnits,
          colour=factor(firstScrape),
          fill=factor(firstScrape)
        ),
        stat="identity",
        position="stack",
        alpha=0.7
      ) +
      geom_text(
        data=data.frame(
          room_type=c("Shared room","Shared room"),
          thingToPrint=c(
            "data source:\ninsideairbnb.com",
            "analysis:\nmartinjohnbrown.net"),
          xpz=as.Date(c("2015-12-31","2015-12-31")),
          ypz=c(
            max(
              theGrowthGraphSummaryData[
                which(
                  theGrowthGraphSummaryData$city==n
                ),
              ]$noUnits
            )/2,
            max(
              theGrowthGraphSummaryData[
                which(
                  theGrowthGraphSummaryData$city==n
                ),
              ]$noUnits
            )/4
          )
        ),
        aes(label=thingToPrint,x=xpz,y=ypz),
        fontface="italic",
        hjust="right",
        lineheight=0.8,
        nudge_x=0.5,
        colour="gray40"
      ) +
      facet_grid(facets = .~room_type) +
      theme(
        legend.justification=c(1,1),
        legend.position=c(0.98,0.98),
        legend.key.size=unit(1.3,"line"),
        axis.title.x=element_text(
          face="italic",
          size=14,
          color="gray50"),
        axis.title.y=element_text(
          face="italic",
          size=14,
          color="gray50"),
        axis.text.x=element_text(
          angle=-90,
          hjust=1,
          vjust = 0.2,
          face="bold",
          size=10,
          color="gray50"),
        axis.text.y=element_text(
          face="bold",
          size=10,
          color="gray50"),
        plot.title=element_text(size=16,face="bold")
      ) +
      scale_x_date(name="Scrape Date", date_labels="%m/%y") +
      scale_y_continuous(name="Number of airbnb listings") +
      scale_colour_discrete(name="Date first observed") +
      scale_fill_discrete(name="Date first observed") +
      ggtitle(
        paste(n,
          ":\ngrowth and persistence of Airbnb listings",
          sep=""
        )
      ) #closes ggtitle
    # end of ggplot command sequence for this chart
  ) # closes print statement for this chart
  dev.off() # closes print device for this chart

  # now, drawing longevity decay curves
  png(
    filename=paste(
      "charts",
      paste(n,"-persistence.png",sep=""),
      sep="/"
    ),
    width=555,
    height=400
  ) #defines a graphic device to print into
  print(
    ggplot() +
      geom_point(
        data=persistenceGraphData[
          which(persistenceGraphData$city==n),
        ],
        aes(
          x=jitter(months),
          y=jitter(pctRem),
          colour=room_type,
          shape=room_type
        ),
        size=2,
        stroke=1.2,
        alpha=0.7
      ) +
      geom_line(
        data=summarise(
          group_by(
            persistenceGraphData[
              which(persistenceGraphData$city==n),
            ],
            months,
            room_type
          ),
          pctRem=mean(pctRem)
        ),
        aes(x=months,y=pctRem,colour=room_type),
        size=2,
        alpha=0.7
      ) +
      geom_text(
        data=data.frame(
          thingToPrint=c(
            "data source:\ninsideairbnb.com",
            "analysis:\nmartinjohnbrown.net"),
          xpz=c(-1,-1),
          ypz=c(15,4)
        ),
        aes(label=thingToPrint,x=xpz,y=ypz),
        fontface="italic",
        hjust="left",
        lineheight=0.8,
        nudge_x=0,
        colour="gray40"
      ) +
      scale_x_continuous(
        name="Months from first appearance",
        breaks=c(0,3,6,9,12)
      ) +
      scale_y_continuous(
        name="Percentage of listings still active",
        breaks=seq(0,100,20)
      ) +
      expand_limits(y=c(0,100)) +
      scale_shape_manual(values=c(1,3,4)) +
      theme(
        legend.justification=c(1,1),
        legend.position=c(0.98,0.98),
        legend.key.size=unit(1.3,"line"),
        plot.title=element_text(size=16,face="bold"),
        axis.title.x=element_text(
          face="italic",
          size=14,
          color="gray50"
        ),
        axis.title.y=element_text(
          face="italic",
          size=14,
          color="gray50"),
        axis.text.y=element_text(
          face="bold",
          size=10,
          color="gray50"),
        axis.text.x=element_text(
          face="bold",
          size=10,
          color="gray50")
      ) +
      ggtitle(
        paste(
          n,
          ":\nmodeled longevity of airbnb listings",
          sep=""
        )
      ) # closes ggtitle and ggplot command for this graph
  ) # closes print statement for this graph
  dev.off() # closes graphic device for this graph
} #closes for statement n, related to city

# calculating some summary statistics
# which I'll need for graphs comparing cities
persistenceMeanByCityRoomType <- summarise(
  group_by(
    persistenceSummaryByCityRoomTypeDatePair,
    city,
    room_type
  ),
  meanPersistYr = mean(persistYr)
)

# entering some city populations from wikipedia
cityPops <- data.frame(
  city=c(
    "amsterdam",
    "london",
    "los-angeles",
    "madrid",
    "melbourne",
    "new-orleans",
    "portland",
    "san-francisco",
    "sydney"
  ),
  urbanPop= c( #urban area population from wikipedia
    1335115,
    9787426,
    12150996,
    6183000,
    3707530,
    389617,
    1849898,
    864816,
    3908642
  ),
  maPop= c( #metropolitan area population from wikipedia
    2431000,
    13879757,
    13131431,
    6489162,
    4529496,
    1262888,
    2389228,
    4656132,
    4920970
  )
)

# adding up the rooms in each city and including population
airBnbUnitCounts <- merge(
  summarize(
    group_by(
      theGrowthGraphSummaryData,
      city,
      room_type,
      scrapeDate
    ),
    noUnits=sum(noUnits)
  ),
  cityPops,
  by="city"
)
mostRecentScrape <- summarise(
  group_by(
    airBnbUnitCounts,
    city,
    room_type
  ),
  latestScrape =max(scrapeDate)
)
airBnbUnitCountsLatest <- merge(
  airBnbUnitCounts,
  mostRecentScrape,
  by.x=c("city","room_type","scrapeDate"),
  by.y=c("city","room_type","latestScrape"),
  all.y=TRUE
)
airBnbUnitCountsLatestWithRatios <- mutate(
  airBnbUnitCountsLatest,
  unitsPerUrbanPop=noUnits/urbanPop,
  unitsPerMAPop=noUnits/maPop,
  urbanPopPerUnit=round(urbanPop/noUnits),
  MAPopPerUnit=round(maPop/noUnits)
)
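# (a quick sanity check of those ratios on made-up numbers:
# a hypothetical city with 8000 listings and an urban
# population of 1,000,000 works out as follows.)
(8000/1000000)*1000  # unitsPerUrbanPop scaled per 1000 residents -> 8
round(1000000/8000)  # urbanPopPerUnit -> 125 residents per listing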
# I want to make a bar chart that shows the number
# of "entire home" listings per city on the most
# recent scrape. I would also like to note the scrape
# date on the graph somehow.
png(
  filename=paste(
    "charts",
    paste("recent-unit-totals-in-9-cities.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot(
  data=filter(
    airBnbUnitCountsLatestWithRatios,
    room_type=="Entire home/apt"
  )
) +
  geom_bar(
    aes(x=city,y=noUnits,colour=city,fill=city),
    stat="identity",
    alpha=0.7
  ) +
  geom_text(
    aes(label=city,x=city,y=(noUnits+200)),
    hjust=0.5,
    vjust=0,
    color="black"
  ) +
  geom_text(
    aes(
      label=as.character(scrapeDate,"%b\n%Y"),
      x=city,
      y=(noUnits/2)
    ),
    angle=90,
    color="gray50",
    hjust="center",
    fontface="italic"
  ) +
  geom_text(
    data=data.frame(
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=c("sydney","sydney"),
      ypz=c(17500,16000)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    nudge_x=0.5,
    lineheight=0.8,
    colour="gray40"
  ) +
  scale_y_continuous(
    name="Active 'entire home' listings"
  ) +
  theme(
    axis.text.x=element_blank(),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_blank(),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.position="none",
    plot.title=element_text(size=16,face="bold")
  ) +
  ggtitle("Quantity of 'entire home' Airbnb listings in 9 global cities")
dev.off() # turns off graphic device

# I want to make a bar chart that shows the number
# of "entire home" listings per city on the most
# recent scrape PER 1000 URBAN POPULATION.
# I would also like to note the scrape
# date on the graph somehow.
png(
  filename=paste(
    "charts",
    paste("units-per-1000-urban-pop.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot(
  data=filter(
    airBnbUnitCountsLatestWithRatios,
    room_type=="Entire home/apt"
  )
) +
  geom_bar(
    aes(
      x=city,
      y=(unitsPerUrbanPop*1000),
      colour=city,
      fill=city
    ),
    stat="identity",
    alpha=0.7
  ) +
  geom_text(
    aes(label=city,x=city,y=(unitsPerUrbanPop*1000+0.1)),
    hjust=0.5,
    vjust=0,
    color="black"
  ) +
  geom_text(
    aes(
      label=as.character(scrapeDate,"%b\n%Y"),
      x=city,
      y=(unitsPerUrbanPop*1000/2)
    ),
    angle=90,
    color="gray50",
    hjust="center",
    fontface="italic"
  ) +
  geom_text(
    data=data.frame(
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=c("sydney","sydney"),
      ypz=c(7.0,6.3)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    lineheight=0.8,
    nudge_x=0.5,
    colour="gray40"
  ) +
  scale_y_continuous(
    name="Number of 'entire home' AirBnb listings\nper 1000 urban population"
  ) +
  theme(
    axis.text.x=element_blank(),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_blank(),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.position="none",
    plot.title=element_text(size=16,face="bold")
  ) +
  ggtitle("'Entire home' AirBnb listings per urban population")
dev.off() # turns off graphic device

# I want to make a bar chart that shows the number
# of "entire home" listings per city on the most
# recent scrape PER 1000 METROPOLITAN POPULATION.
# I would also like to note the scrape
# date on the graph somehow.
png(
  filename=paste(
    "charts",
    paste("units-per-1000-metro-pop.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot(
  data=filter(
    airBnbUnitCountsLatestWithRatios,
    room_type=="Entire home/apt"
  )
) +
  geom_bar(
    aes(
      x=city,
      y=(unitsPerMAPop*1000),
      colour=city,
      fill=city
    ),
    stat="identity",
    alpha=0.7
  ) +
  geom_text(
    aes(label=city,x=city,y=(unitsPerMAPop*1000+0.05)),
    hjust=0.5,
    vjust=0,
    color="black"
  ) +
  geom_text(
    aes(
      label=as.character(scrapeDate,"%b\n%Y"),
      x=city,
      y=(unitsPerMAPop*1000/2)
    ),
    angle=90,
    color="gray50",
    hjust="center",
    fontface="italic"
  ) +
  geom_text(
    data=data.frame(
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=c("sydney","sydney"),
      ypz=c(3.7,3.4)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    lineheight=0.8,
    nudge_x=0.5,
    colour="gray40"
  ) +
  scale_y_continuous(
    name="Number of 'entire home' AirBnb listings\nper 1000 metropolitan area population"
  ) +
  theme(
    axis.text.x=element_blank(),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_blank(),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.position="none",
    plot.title=element_text(size=16,face="bold")
  ) +
  ggtitle("'Entire home' AirBnb listings per metro population")
dev.off() # turns off graphic device

# I want to make a bar chart that shows the modeled
# persistence of "entire home" listings per city.
png(
  filename=paste(
    "charts",
    paste("modelled-persistence-in-9-cities.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot() +
  geom_jitter(
    data=filter(
      persistenceSummaryByCityRoomTypeDatePair,
      room_type=="Entire home/apt"
    ),
    aes(
      x=city,
      y=persistYr*100,
      colour=city,
      fill=city
    )
  ) +
  geom_bar(
    data=filter(
      persistenceMeanByCityRoomType,
      room_type=="Entire home/apt"
    ),
    aes(
      x=city,
      y=meanPersistYr*100,
      colour=city,
      fill=city
    ),
    stat="identity",
    alpha=0.4
  ) +
  geom_text(
    data=filter(
      persistenceMeanByCityRoomType,
      room_type=="Entire home/apt"
    ),
    aes(label=city,x=city,y=(meanPersistYr*100+1)),
    hjust=0.5,
    vjust=0,
    color="black"
  ) +
  geom_text(
    data=data.frame(
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=c("sydney","sydney"),
      ypz=c(97,90)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    lineheight=0.8,
    nudge_x=0.5,
    colour="gray40"
  ) +
  scale_y_continuous(
    name="Modeled percentage of 'entire home' AirBnb listings \nstill active after 1 year"
  ) +
  expand_limits(y=c(0,100)) +
  theme(
    axis.text.x=element_blank(),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_blank(),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.position="none",
    plot.title=element_text(size=16,face="bold")
  ) +
  ggtitle("Modeling based on repeated 'scrapes' suggests\nmany AirBnb listings are active less than 1 year")
dev.off() # turns off graphic device

# I want to make a chart that shows growth in
# Airbnb in all cities on a common scale --
# the number of units observed on the first
# scrape.
# I don't know why the chain operator isn't working for
# me right now, so I am going to do the following really
# inefficiently (a chained sketch appears at the end of
# this script).
tempDF <- as.data.frame(
  filter(
    theGrowthGraphSummaryData,
    room_type=="Entire home/apt"
  )
)
tempDF <- arrange(
  tempDF,
  city,
  room_type,
  scrapeDate,
  firstScrape
)
tempDF <- summarise(
  group_by(
    tempDF,
    city,
    room_type,
    scrapeDate
  ),
  totUnits = sum(noUnits)
)
tempDF2 <- summarise(
  group_by(tempDF, city, room_type),
  firstDate=first(scrapeDate),
  firstCount=first(totUnits)
)
entireHomesGrowthSummary <- merge(
  as.data.frame(
    filter(
      theGrowthGraphSummaryData,
      room_type=="Entire home/apt"
    )
  ),
  as.data.frame(tempDF2),
  by.x=c("city","room_type"),
  by.y=c("city","room_type"),
  all.x=TRUE
) %>%
  mutate(
    pctOfFirst=noUnits/firstCount
  )
entireHomesGrowthSummaryPlus <- rbind(
  data.frame(
    entireHomesGrowthSummary,
    thingy="growth in total number of 'entire home' listings"
  ),
  data.frame(
    filter(
      entireHomesGrowthSummary,
      firstScrape==firstDate
    ),
    thingy="decline among listings active on first scrape"
  )
)
entireHomesGrowthSummaryPlusSummed <- summarise(
  group_by(
    entireHomesGrowthSummaryPlus,
    thingy,
    city,
    room_type,
    scrapeDate
  ),
  totUnits=sum(noUnits),
  firstUnits=mean(firstCount)
) %>%
  mutate(
    pctUnits=100*totUnits/firstUnits
  )

# open a graphic device
png(
  filename=paste(
    "charts",
    paste("growth-in-9-cities.png",sep=""),
    sep="/"
  ),
  width=555,
  height=400
) #defines a graphic device to print into
ggplot() +
  geom_point(
    data=entireHomesGrowthSummaryPlusSummed,
    aes(
      x=scrapeDate,
      y=pctUnits,
      colour=city,
      fill=city
    ),
    size=3,
    alpha=0.6
  ) +
  geom_line(
    data=entireHomesGrowthSummaryPlusSummed,
    aes(
      x=scrapeDate,
      y=pctUnits,
      colour=city,
      fill=city
    ),
    size=1,
    alpha=0.6
  ) +
  geom_text(
    data=data.frame(
      thingy=c(
        "decline among listings active on first scrape",
        "decline among listings active on first scrape"
      ),
      thingToPrint=c(
        "data source:\ninsideairbnb.com",
        "analysis:\nmartinjohnbrown.net"),
      xpz=as.Date(c("2016-04-30","2016-04-30")),
      ypz=c(157,145)
    ),
    aes(label=thingToPrint,x=xpz,y=ypz),
    fontface="italic",
    hjust="right",
    lineheight=0.8,
    nudge_x=0.5,
    color="gray40"
  ) +
  facet_grid(facets=.~thingy) +
  ggtitle("Growth and decline in AirBnb listings in nine global cities") +
  guides(
    fill=guide_legend(ncol=2),
    color=guide_legend(ncol=2)
  ) +
  scale_x_date(
    name="Scrape Date",
    date_labels="%m/%y"
  ) +
  scale_y_continuous(
    name="Number of 'entire home' Airbnb listings\n(as percentage of initial count)"
  ) +
  theme(
    axis.text.x=element_text(
      # angle=90,
      # hjust=0,
      # vjust = 0.2,
      face="bold",
      size=10,
      color="gray50"),
    axis.text.y=element_text(
      face="bold",
      size=10,
      color="gray50"),
    axis.title.x=element_text(
      face="italic",
      size=14,
      color="gray50"),
    axis.title.y=element_text(
      face="italic",
      size=14,
      color="gray50"),
    legend.justification=c(0,0),
    legend.key.size=unit(0.5,"line"),
    legend.position=c(0.09,0.03),
    legend.title.align=0.5,
    plot.title=element_text(size=16,face="bold"),
    strip.text.x=element_text(size=10,face="bold.italic")
  )
# end of ggplot command sequence for this chart
dev.off() # closes graphic device
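# (the chained sketch promised above: with a working %>%, the
# three tempDF steps could likely be collapsed into a single
# pipeline. This is only a sketch under that assumption;
# "tempDFChained" is an invented name, and it has not been
# tested against the package versions used in this script.)
tempDFChained <- theGrowthGraphSummaryData %>%
  filter(room_type=="Entire home/apt") %>%
  as.data.frame() %>%
  arrange(city, room_type, scrapeDate, firstScrape) %>%
  group_by(city, room_type, scrapeDate) %>%
  summarise(totUnits = sum(noUnits))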