library(magrittr)
library(tidyverse)
library(rvest)
library(ggvis)
library(glue)
library(ggridges)
library(xlsx)
library(fs)
library(tictoc)
source("olx_scraper_utils.R")
To refresh scrape must delete both .rds below from disk
fname_advert <- "advert_df_pp.rds"
advert_exists <- file_exists(fname_advert)
Inspired by this article
Construct DF with brand, model, pages, and subpages
urls_1_df <- tribble(
~brand, ~model,
"honda", "civic",
"toyota", "corolla",
"nissan", "sentra",
"vw-volkswagen", "jetta",
"renault", "fluence") %>%
mutate(url=olxURL(brand,model),
subpages=map(url, ~.x %>%
read_html() %>%
getPageNumbers))
Cleans the “integer(0)” when there’s only the 1st page, unnest to make a separate row for each subpage, and compute url’s of subpages:
urls_complete <- urls_1_df %>%
mutate(subpages=map(subpages,
~(if(length(.x)==0) 1L else .x))) %>%
unnest(subpages) %>%
mutate(url_subpage=olxURL_page(brand,model,subpages))
Get pages, slow hits webserver
tic()
urls_complete %<>% mutate(page=get_html_pages(urls_complete$url_subpage))
toc()
#> 119.842 sec elapsed
advert_df0 <- urls_complete %>%
mutate(link=get_html_href(page,".OLXad-list-link"),
title=get_html_text(page,".OLXad-list-title"),
price=get_html_text(page,".OLXad-list-price") %>% map(convertReais),
region=get_html_text(page,".detail-region") %>%
map(~str_replace_all(.x,"[\\s\\n\\t]+"," ")),
kcm=get_html_text(page,".detail-specific") %>%
map(decodeKmCambioMotor))
# does this to adjust all list-columns to a max number of items
# which is equal to the min of all link,title,prince,region,kcm
max_items0 <- advert_df0%>%
select_if(is.list)%>%
map(~map_int(.x,length))%>%
as_tibble%>%
mutate(max_items=pmap_int(list(link,title,price,region,kcm),
~min(..1,..2,..3,..4,..5)))
advert_df <- advert_df0 %>%
mutate_at(vars(link,title,price,region,kcm),
~map2(.,max_items0$max_items,~head(.x,.y))) %>%
unnest(link,title,price,region,kcm,.preserve=page) %>%
separate(kcm,into=c("km","cambio","motor"),"\\|",convert=T)
Post process advert_df
advert_df_pp <- advert_df %>%
transmute(brand=as.factor(brand), model=as.factor(model),
year=title %>% str_sub(start=-4) %>% as.integer() %>% as.factor(),
title=str_sub(title,end=-7),
ddd=str_sub(region,start=-2),
region=str_replace(region," - DDD \\d\\d",""),
price1k=round(price/1000,1),
km1k=round(km/1000,1),
cambio=as.factor(cambio),
motor=as.factor(motor),
link)
Recursively get car description from linked page. Slow (hits webserver many times)
if(!advert_exists) advert_df_pp %<>%
mutate(pages_descr=get_html_pages(link))
Montando novo DF com descrições (seguindo link do anuncio), note uso da função get_html_text_collapse() para não ignorar
:
if(!advert_exists)
advert_df_pp %<>%
mutate(descr=unlist(get_html_text_collapse(pages_descr,
".OLXad-description")),
model_descr=unlist(get_html_text(pages_descr,".model")))
if(!advert_exists)
write_rds(advert_df_pp,fname_advert,compress = "bz")
A partir daqui só precisa de advert_df_pp
advert_df_pp <- read_rds(fname_advert)
advert_df_pp_clean <- advert_df_pp %>%
mutate(key=row_number(),
descr=descr %>% str_to_lower() %>% str_replace_all("[^[:alnum:]/]"," ") %>%
str_replace_all("\\s{2,}"," "),
model_descr=str_replace(model_descr,"Modelo:[\\n\\t]+","") %>%
str_replace("AUTOM[AÁ]TICO","AUT.") %>%
str_replace("(SED.|SEDAN|UPPER) ","") %>%
str_replace("FLEX\\s?(START|FUEL|ONE)","FLEX"))
Reporta preço mediano e N por modelo. Parece q modelos são padronizados tipo FIPE
advert_df_pp_clean_counts <- advert_df_pp_clean %>%
group_by(model) %>%
summarize(N=n(),price1k_median=median(price1k)) %>%
arrange(desc(price1k_median)) %>%
ungroup() %T>% print
#> # A tibble: 5 x 3
#> model N price1k_median
#> <fct> <int> <dbl>
#> 1 corolla 251 72.9
#> 2 jetta 390 72
#> 3 civic 596 65.9
#> 4 sentra 147 62.8
#> 5 fluence 218 52.9
advert_df_pp_clean_counts <- advert_df_pp_clean %>%
#mutate(model=compressModel(model) %>% str_replace_all("\n"," ")) %>%
group_by(brand,model) %>%
summarize(N=n(),price1k_median=median(price1k)) %>%
arrange(desc(price1k_median)) %>%
ungroup() %T>% print
#> # A tibble: 5 x 4
#> brand model N price1k_median
#> <fct> <fct> <int> <dbl>
#> 1 toyota corolla 251 72.9
#> 2 vw-volkswagen jetta 390 72
#> 3 honda civic 596 65.9
#> 4 nissan sentra 147 62.8
#> 5 renault fluence 218 52.9
Modelos mais frequentes
advert_df_pp_clean %>%
mutate(model = model %>% fct_infreq() %>% fct_lump(5)) %>%
ggplot(aes(x=model,fill=model)) +
#geom_bar(stat="count") +
geom_bar(aes(y = (..count..)/sum(..count..))) +
scale_y_continuous(labels=scales::percent) +
labs(y="%",title="Modelos mais frequentes")+
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none")
Mostrar seis modelos com mediana de preço mais alta
fun_length_y <- function(x) data.frame(y=median(x),label= paste0("N=", length(x)))
advert_df_pp_clean %>%
mutate(model = model %>% fct_lump(8) %>%
fct_reorder(price1k,.desc=T)) %>%
filter(model!="Other") %>%
ggplot(aes(x=model,y=price1k,fill=model)) +
#geom_bar(stat="count") +
geom_boxplot(width=.5,notch=T) +
stat_summary(fun.data = fun_length_y,
geom = "text", vjust = .5,hjust=-1, size = 3) +
geom_jitter(color="black",alpha=.1,width=.1,size=2) +
theme_bw() +
coord_cartesian(ylim=c(50,80)) +
labs(title="Modelos com mediana mais cara")+
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "none")
ggsave("most_expensive.png",width=10,height=5,units="in")
advert_df_pp_clean %>%
filter(!is.na(km1k),!is.na(price1k)) %>%
filter(year==2016) %>%
ggplot(aes(x=km1k,y=price1k,color=model)) +
geom_point() +
geom_smooth(size=2) +
theme_bw()
advert_df_pp_clean %>%
filter(year==2016,!is.na(km1k),
model %in% c("civic","corolla","sentra")) %>%
mutate(km1k=km1k %>% cut_width(10,boundary=0)) %>%
group_by(model,km1k) %>%
summarize(price1k_mad=mad(price1k),
price1k=median(price1k) # must be 2nd as prev result needs it
) %>%
ungroup() %>%
mutate(model=model%>%fct_reorder(price1k,.desc=T)) %>%
ggplot(aes(x=km1k,y=price1k)) +
geom_line(aes(group=model,color=model),size=2) +
geom_errorbar(aes(x=km1k,ymin=price1k-price1k_mad,ymax=price1k+price1k_mad,
group=model,color=model),width = 0.25) +
geom_point(aes(color=model),alpha=.5) +
coord_cartesian(ylim=c(40,80)) +
theme_bw()
advert_df_pp_clean %>%
filter(year==2016,model %in% c("civic","corolla","sentra")) %>%
mutate(km1k_range=km1k %>% cut_width(10,boundary=0),
model=model%>%fct_reorder(price1k,.desc=T)) %>%
ggplot(aes(x=km1k_range,y=price1k)) +
geom_boxplot(aes(fill=model)) +
geom_point(aes(color=model),alpha=.25) +
coord_cartesian(ylim=c(40,80)) +
theme_bw()
Para cada carro (identificado por “key”), liste as palavras usadas na descrição.
advert_df_pp_words <- advert_df_pp_clean %>%
select(key,descr) %>%
mutate(word=str_split(descr,"\\s+")) %>%
select(-descr) %>%
unnest() %>%
filter(str_length(word)>2,
!str_detect(word,"\\d"))
Estudo frequencial das palavras
advert_df_pp_words_counted <- advert_df_pp_words %>%
count(word,sort=T) %T>% print
#> # A tibble: 2,349 x 2
#> word n
#> <chr> <int>
#> 1 carro 1372
#> 2 com 1298
#> 3 ver 857
#> 4 número 847
#> 5 para 529
#> 6 sem 496
#> 7 ipva 476
#> 8 dono 473
#> 9 couro 468
#> 10 automático 443
#> # … with 2,339 more rows
Salvamos em excel para manualmente marcarmos palavras com sentimento negativo
advert_df_pp_words_counted %>%
as.data.frame() %>%
write.xlsx("advert_df_pp_words_counted.xls", row.names=F)
Le arquivo com palavras com sentimento negativo
df_suspicious <- read_csv("suspicious words.csv") %T>% print
#> # A tibble: 36 x 1
#> word
#> <chr>
#> 1 nunca
#> 2 entrada
#> 3 leilão
#> 4 seguradora
#> 5 avaria
#> 6 bateu
#> 7 batida
#> 8 furto
#> 9 nenhuma
#> 10 parcelada
#> # … with 26 more rows
Acha anuncios q contem palavra negativa
advert_df_pp_words_negative <- advert_df_pp_words %>%
semi_join(df_suspicious,by="word") %>%
group_by(key) %>%
summarize(negativity_count=n(),
bad_words=paste0(word,collapse=","))
Expande df de anúncios com análise de sentimento
advert_df_pp_clean_sentiment <- advert_df_pp_clean %>%
left_join(advert_df_pp_words_negative,by="key") %>%
mutate(negativity_count=if_else(is.na(negativity_count),0L,negativity_count)) %>%
arrange(desc(negativity_count))
head(advert_df_pp_clean_sentiment,100)
df_filt <- advert_df_pp_clean_sentiment %>%
filter(price1k>40,!is.na(brand),year==2016) %>%
mutate(tooltip=glue("<a href={link}>{title}\n{region}\n</a>"))
Histograma da contagem de palavras negativas por anúncio
df_filt %>%
ggplot(aes(x=model,y=negativity_count+1,fill=model)) +
geom_boxplot() +
scale_y_log10() +
ggtitle("Contagem de palavras negativas") +
theme(legend.position = "none")
df_filt %>%
mutate(model=fct_reorder(model,price1k,.desc=T),
suspicious=negativity_count>1) %>%
arrange(suspicious) %>% # so true is drawn last
#mutate(suspicious=as.factor(suspicious) %>% fct_inorder() %>% fct_rev()) %>%
ggplot(aes(x=model,y=price1k,fill=brand)) +
#geom_violin(alpha=.25) +
geom_boxplot(notch = T, show.legend = F) +
scale_fill_discrete(guide = "none") +
stat_summary(fun.data = fun_length_y,
geom = "text", vjust = 1,hjust=.5, size = 4) +
geom_jitter(aes(color=suspicious,shape=suspicious),alpha=.5,width=.1,size=3) +
scale_shape_manual(values=c(16,17)) +
scale_colour_manual(values = c("gray","red")) +
theme_bw() +
ggtitle("Sedans \"Black\", RJ, particular",
subtitle="km < 80k, price < R$80k, year = 2016")
Ridge plot dos preços dos 3 sedans
df_filt_medians <- df_filt %>%
group_by(model) %>%
summarize(N=n(),
price1k_median=median(price1k),
price1k_mean=mean(price1k)) %>%
arrange(desc(price1k_median)) %>%
mutate(y=row_number())
df_filt %>%
mutate(model=fct_reorder(model,price1k,.desc=T)) %>%
ggplot(aes(price1k, model,fill=model)) +
geom_density_ridges(alpha=.5) +
geom_text(aes(x=90,y=y+.2,label=sprintf("N=%d",N)),
data=df_filt_medians) +
geom_segment(aes(x=price1k_median,y=y-.1,xend=price1k_median,yend=y+.1),
data=df_filt_medians,color="blue",size=2) +
geom_text(aes(x=price1k_median,y=y+.2,label=sprintf("med=%.1f",price1k_median)),
data=df_filt_medians,color="blue") +
geom_segment(aes(x=price1k_mean,y=y-.1,xend=price1k_mean,yend=y+.1),
data=df_filt_medians,color="red",size=2) +
geom_text(aes(x=price1k_mean,y=y-.2,label=sprintf("avg=%.1f",price1k_mean)),
data=df_filt_medians,color="red") +
theme_ridges() +
ggtitle("Sedans \"Black\", RJ, particular",
subtitle="km < 80k, price < R$80k, year = 2016") +
theme(legend.position = "none")
#> Picking joint bandwidth of 1.27
Preço vs km
df_filt %>%
filter(!is.na(km1k),!is.na(price1k)) %>%
mutate(suspicious=negativity_count>4) %>%
ggplot(aes(km1k,price1k,color=brand,group=brand)) +
geom_point(aes(shape=suspicious),size=3) +
geom_smooth() +
theme_bw() +
ggtitle("Sedans Japonses, RJ, particular",
subtitle="km < 80k, price < R$80k, year >= 2016")
#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Mesmo plot com ggvis (tooltips seems buggy, cannot yet add title)
df_filt %>%
ggvis(~km1k, ~price1k, fill=~brand) %>%
group_by(brand) %>%
layer_points() %>%
# hangs
# add_tooltip(function(df) "hello",on="click") %>%
layer_smooths(stroke=~brand)
advert_df_pp_clean_sentiment %>%
mutate(model=model %>% fct_reorder(price1k,.desc=T)) %>%
ggplot(aes(x=model,y=price1k,fill=model)) +
geom_boxplot() +
theme(legend.position = "none") +
ggtitle("Car Prices in RJ",
subtitle="yr = 2016, miles < 80k, price < 80k")