OLX Car Price Scraper

Load libraries

library(magrittr)
library(tidyverse)
library(rvest)
library(ggvis)
library(glue)
library(ggridges)
library(xlsx)
library(fs)
library(tictoc)
source("olx_scraper_utils.R")

To refresh scrape must delete both .rds below from disk

fname_advert <- "advert_df_pp.rds"
advert_exists <- file_exists(fname_advert)

Scraping

Inspired by this article

Construct DF with brand, model, pages, and subpages

urls_1_df <- tribble(
  ~brand, ~model, 
  "honda", "civic",
  "toyota", "corolla",
  "nissan", "sentra",
  "vw-volkswagen", "jetta",
  "renault", "fluence") %>%
  mutate(url=olxURL(brand,model),
         subpages=map(url, ~.x %>%
                        read_html() %>%
                        getPageNumbers))

Cleans the “integer(0)” when there’s only the 1st page, unnest to make a separate row for each subpage, and compute url’s of subpages:

urls_complete <- urls_1_df %>%
  mutate(subpages=map(subpages,
                      ~(if(length(.x)==0) 1L else .x))) %>%             
  unnest(subpages) %>%
  mutate(url_subpage=olxURL_page(brand,model,subpages))

Get pages, slow hits webserver

tic()
urls_complete %<>% mutate(page=get_html_pages(urls_complete$url_subpage))
toc()
#> 119.842 sec elapsed

Build dataframe for links, title, prices for all ads

advert_df0 <- urls_complete %>%
  mutate(link=get_html_href(page,".OLXad-list-link"),
         title=get_html_text(page,".OLXad-list-title"),
         price=get_html_text(page,".OLXad-list-price") %>% map(convertReais),
         region=get_html_text(page,".detail-region") %>%
           map(~str_replace_all(.x,"[\\s\\n\\t]+"," ")),
         kcm=get_html_text(page,".detail-specific") %>%
           map(decodeKmCambioMotor))

# does this to adjust all list-columns to a max number of items
# which is equal to the min of all link,title,prince,region,kcm
max_items0 <- advert_df0%>%
  select_if(is.list)%>%
  map(~map_int(.x,length))%>%
  as_tibble%>%
  mutate(max_items=pmap_int(list(link,title,price,region,kcm),
                            ~min(..1,..2,..3,..4,..5)))

advert_df <- advert_df0 %>%
  mutate_at(vars(link,title,price,region,kcm),
            ~map2(.,max_items0$max_items,~head(.x,.y))) %>%
  unnest(link,title,price,region,kcm,.preserve=page) %>%
  separate(kcm,into=c("km","cambio","motor"),"\\|",convert=T)

Post process advert_df

advert_df_pp <- advert_df %>%
  transmute(brand=as.factor(brand), model=as.factor(model),
            year=title %>% str_sub(start=-4) %>% as.integer() %>% as.factor(),
            title=str_sub(title,end=-7),
            ddd=str_sub(region,start=-2),
            region=str_replace(region," - DDD \\d\\d",""),
            price1k=round(price/1000,1),
            km1k=round(km/1000,1),
            cambio=as.factor(cambio),
            motor=as.factor(motor),
            link)

Recursively get car description from linked page. Slow (hits webserver many times)

if(!advert_exists) advert_df_pp %<>%
  mutate(pages_descr=get_html_pages(link))

Montando novo DF com descrições (seguindo link do anuncio), note uso da função get_html_text_collapse() para não ignorar
:

if(!advert_exists)
  advert_df_pp %<>%
  mutate(descr=unlist(get_html_text_collapse(pages_descr,
                                             ".OLXad-description")),
         model_descr=unlist(get_html_text(pages_descr,".model")))

if(!advert_exists)
  write_rds(advert_df_pp,fname_advert,compress = "bz")

Limpeza da descrição e modelo

A partir daqui só precisa de advert_df_pp

advert_df_pp <- read_rds(fname_advert)

advert_df_pp_clean <- advert_df_pp %>%
  mutate(key=row_number(),
         descr=descr %>% str_to_lower() %>% str_replace_all("[^[:alnum:]/]"," ") %>%
           str_replace_all("\\s{2,}"," "),
         model_descr=str_replace(model_descr,"Modelo:[\\n\\t]+","") %>%
           str_replace("AUTOM[AÁ]TICO","AUT.") %>%
           str_replace("(SED.|SEDAN|UPPER) ","") %>%
           str_replace("FLEX\\s?(START|FUEL|ONE)","FLEX"))

Reporta preço mediano e N por modelo. Parece q modelos são padronizados tipo FIPE

advert_df_pp_clean_counts <- advert_df_pp_clean %>%
  group_by(model) %>%
  summarize(N=n(),price1k_median=median(price1k)) %>%
  arrange(desc(price1k_median)) %>%
  ungroup() %T>% print
#> # A tibble: 5 x 3
#>   model       N price1k_median
#>   <fct>   <int>          <dbl>
#> 1 corolla   251           72.9
#> 2 jetta     390           72  
#> 3 civic     596           65.9
#> 4 sentra    147           62.8
#> 5 fluence   218           52.9

advert_df_pp_clean_counts <- advert_df_pp_clean %>%
  #mutate(model=compressModel(model) %>% str_replace_all("\n"," ")) %>%
  group_by(brand,model) %>%
  summarize(N=n(),price1k_median=median(price1k)) %>%
  arrange(desc(price1k_median)) %>%
  ungroup() %T>% print
#> # A tibble: 5 x 4
#>   brand         model       N price1k_median
#>   <fct>         <fct>   <int>          <dbl>
#> 1 toyota        corolla   251           72.9
#> 2 vw-volkswagen jetta     390           72  
#> 3 honda         civic     596           65.9
#> 4 nissan        sentra    147           62.8
#> 5 renault       fluence   218           52.9

Modelos mais frequentes

advert_df_pp_clean %>%
  mutate(model = model %>% fct_infreq() %>% fct_lump(5)) %>%
  ggplot(aes(x=model,fill=model)) +
  #geom_bar(stat="count") +
  geom_bar(aes(y = (..count..)/sum(..count..))) + 
  scale_y_continuous(labels=scales::percent) +
  labs(y="%",title="Modelos mais frequentes")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

Mostrar seis modelos com mediana de preço mais alta

fun_length_y <- function(x) data.frame(y=median(x),label= paste0("N=", length(x)))

advert_df_pp_clean %>%
  mutate(model = model %>% fct_lump(8) %>%
           fct_reorder(price1k,.desc=T)) %>%
  filter(model!="Other") %>%
  ggplot(aes(x=model,y=price1k,fill=model)) +
  #geom_bar(stat="count") +
  geom_boxplot(width=.5,notch=T) +
  stat_summary(fun.data = fun_length_y,
               geom = "text", vjust = .5,hjust=-1, size = 3) +
  geom_jitter(color="black",alpha=.1,width=.1,size=2) +
  theme_bw() +
  coord_cartesian(ylim=c(50,80)) +
  labs(title="Modelos com mediana mais cara")+
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "none")

ggsave("most_expensive.png",width=10,height=5,units="in")

advert_df_pp_clean %>%
  filter(!is.na(km1k),!is.na(price1k)) %>%
  filter(year==2016) %>%
  ggplot(aes(x=km1k,y=price1k,color=model)) +
  geom_point() +
  geom_smooth(size=2) +
  theme_bw()

advert_df_pp_clean %>%
  filter(year==2016,!is.na(km1k),
         model %in% c("civic","corolla","sentra")) %>%
  mutate(km1k=km1k %>% cut_width(10,boundary=0)) %>%
  group_by(model,km1k) %>%
  summarize(price1k_mad=mad(price1k),
            price1k=median(price1k) # must be 2nd as prev result needs it
            ) %>%
  ungroup() %>%
  mutate(model=model%>%fct_reorder(price1k,.desc=T)) %>%
  ggplot(aes(x=km1k,y=price1k)) +
  geom_line(aes(group=model,color=model),size=2) +
  geom_errorbar(aes(x=km1k,ymin=price1k-price1k_mad,ymax=price1k+price1k_mad,
                    group=model,color=model),width = 0.25) +
  geom_point(aes(color=model),alpha=.5) +
  coord_cartesian(ylim=c(40,80)) +
  theme_bw()

advert_df_pp_clean %>%
  filter(year==2016,model %in% c("civic","corolla","sentra")) %>%
  mutate(km1k_range=km1k %>% cut_width(10,boundary=0),
         model=model%>%fct_reorder(price1k,.desc=T)) %>%
  ggplot(aes(x=km1k_range,y=price1k)) +
  geom_boxplot(aes(fill=model)) +
  geom_point(aes(color=model),alpha=.25) +
  coord_cartesian(ylim=c(40,80)) +
  theme_bw()

Sentiment Analysis

Para cada carro (identificado por “key”), liste as palavras usadas na descrição.

advert_df_pp_words <- advert_df_pp_clean %>%
  select(key,descr) %>%
  mutate(word=str_split(descr,"\\s+")) %>%
  select(-descr) %>%
  unnest() %>%
  filter(str_length(word)>2,
         !str_detect(word,"\\d"))

Estudo frequencial das palavras

advert_df_pp_words_counted <- advert_df_pp_words %>%
  count(word,sort=T) %T>% print
#> # A tibble: 2,349 x 2
#>    word           n
#>    <chr>      <int>
#>  1 carro       1372
#>  2 com         1298
#>  3 ver          857
#>  4 número       847
#>  5 para         529
#>  6 sem          496
#>  7 ipva         476
#>  8 dono         473
#>  9 couro        468
#> 10 automático   443
#> # … with 2,339 more rows

Salvamos em excel para manualmente marcarmos palavras com sentimento negativo

advert_df_pp_words_counted %>%
  as.data.frame() %>%
  write.xlsx("advert_df_pp_words_counted.xls", row.names=F)

Le arquivo com palavras com sentimento negativo

df_suspicious <- read_csv("suspicious words.csv") %T>% print
#> # A tibble: 36 x 1
#>    word      
#>    <chr>     
#>  1 nunca     
#>  2 entrada   
#>  3 leilão    
#>  4 seguradora
#>  5 avaria    
#>  6 bateu     
#>  7 batida    
#>  8 furto     
#>  9 nenhuma   
#> 10 parcelada 
#> # … with 26 more rows

Acha anuncios q contem palavra negativa

advert_df_pp_words_negative <- advert_df_pp_words %>%
  semi_join(df_suspicious,by="word") %>%
  group_by(key) %>%
  summarize(negativity_count=n(),
            bad_words=paste0(word,collapse=","))

Expande df de anúncios com análise de sentimento

advert_df_pp_clean_sentiment <- advert_df_pp_clean %>%
  left_join(advert_df_pp_words_negative,by="key") %>%
  mutate(negativity_count=if_else(is.na(negativity_count),0L,negativity_count)) %>%
  arrange(desc(negativity_count))
head(advert_df_pp_clean_sentiment,100)

Plots filtrados por sentimento non-suspicious

df_filt <- advert_df_pp_clean_sentiment %>%
  filter(price1k>40,!is.na(brand),year==2016) %>%
  mutate(tooltip=glue("<a href={link}>{title}\n{region}\n</a>"))

Histograma da contagem de palavras negativas por anúncio

df_filt %>%
  ggplot(aes(x=model,y=negativity_count+1,fill=model)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Contagem de palavras negativas") +
  theme(legend.position = "none")

df_filt %>%
  mutate(model=fct_reorder(model,price1k,.desc=T),
         suspicious=negativity_count>1) %>%
  arrange(suspicious) %>% # so true is drawn last
  #mutate(suspicious=as.factor(suspicious) %>% fct_inorder() %>% fct_rev()) %>%
  ggplot(aes(x=model,y=price1k,fill=brand)) +
  #geom_violin(alpha=.25) +
  geom_boxplot(notch = T, show.legend = F) +
  scale_fill_discrete(guide = "none") +
  stat_summary(fun.data = fun_length_y,
               geom = "text", vjust = 1,hjust=.5, size = 4) +
  geom_jitter(aes(color=suspicious,shape=suspicious),alpha=.5,width=.1,size=3) +
  scale_shape_manual(values=c(16,17)) +
  scale_colour_manual(values = c("gray","red")) +
  theme_bw() +
  ggtitle("Sedans \"Black\", RJ, particular",
          subtitle="km < 80k, price < R$80k, year = 2016")

Ridge plot dos preços dos 3 sedans

df_filt_medians <- df_filt %>%
  group_by(model) %>%
  summarize(N=n(),
            price1k_median=median(price1k),
            price1k_mean=mean(price1k)) %>%
  arrange(desc(price1k_median)) %>%
  mutate(y=row_number())
  

df_filt %>%
  mutate(model=fct_reorder(model,price1k,.desc=T)) %>%
  ggplot(aes(price1k, model,fill=model)) +
  geom_density_ridges(alpha=.5) +
  geom_text(aes(x=90,y=y+.2,label=sprintf("N=%d",N)),
            data=df_filt_medians) +
  geom_segment(aes(x=price1k_median,y=y-.1,xend=price1k_median,yend=y+.1),
            data=df_filt_medians,color="blue",size=2) +
  geom_text(aes(x=price1k_median,y=y+.2,label=sprintf("med=%.1f",price1k_median)),
            data=df_filt_medians,color="blue") +
  geom_segment(aes(x=price1k_mean,y=y-.1,xend=price1k_mean,yend=y+.1),
               data=df_filt_medians,color="red",size=2) +
  geom_text(aes(x=price1k_mean,y=y-.2,label=sprintf("avg=%.1f",price1k_mean)),
            data=df_filt_medians,color="red") +
  theme_ridges() +
  ggtitle("Sedans \"Black\", RJ, particular",
          subtitle="km < 80k, price < R$80k, year = 2016") +
  theme(legend.position = "none")
#> Picking joint bandwidth of 1.27

Preço vs km

df_filt %>%
  filter(!is.na(km1k),!is.na(price1k)) %>%
  mutate(suspicious=negativity_count>4) %>%
  ggplot(aes(km1k,price1k,color=brand,group=brand)) +
  geom_point(aes(shape=suspicious),size=3) +
  geom_smooth() +
  theme_bw() +
  ggtitle("Sedans Japonses, RJ, particular",
          subtitle="km < 80k, price < R$80k, year >= 2016")
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Mesmo plot com ggvis (tooltips seems buggy, cannot yet add title)

df_filt %>%
  ggvis(~km1k, ~price1k, fill=~brand) %>%
  group_by(brand) %>%
  layer_points() %>%
  # hangs
  # add_tooltip(function(df) "hello",on="click") %>%
  layer_smooths(stroke=~brand)

advert_df_pp_clean_sentiment %>%
  mutate(model=model %>% fct_reorder(price1k,.desc=T)) %>% 
  ggplot(aes(x=model,y=price1k,fill=model)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  ggtitle("Car Prices in RJ",
          subtitle="yr = 2016, miles < 80k, price < 80k")