A simple guide to K-Means in R
Using Kaggle.com Amazon Bestsellers Dataset
Analysis of the Amazon bestsellers and their statistics using various R packages.
For this project I used Quarto instead of R Markdown.
Notebook available at:
Amazon Bestsellers
Loading libraries
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(skimr))
suppressPackageStartupMessages(library(corrplot))
suppressPackageStartupMessages(library(RColorBrewer))
Loading Data
# Load the Kaggle export. The path must be delimited with plain ASCII double
# quotes; typographic quotes are not string delimiters in R.
# NOTE(review): the later code relies on column names such as `User.Rating`,
# presumably produced by read.csv()'s default `check.names = TRUE` -- confirm
# against the CSV header.
bestsellers <- read.csv("bestsellers with categories.csv")

# Peek at the first two rows and at the column types.
head(bestsellers, 2)
str(bestsellers)
```
Exploring the Data
Column Names
# List the column names of the data frame.
names(bestsellers)
Checking for missing Values
# TRUE if at least one cell anywhere in the data frame is NA.
anyNA(bestsellers)
Data at a glance
# Distinct values present in the Year and Genre columns.
unique(bestsellers[["Year"]])
unique(bestsellers[["Genre"]])
Data Distribution
# Per-column summary statistics from skimr.
skimr::skim(data = bestsellers)
## all data formats look good, no conversion needed for now
Data Visualization
# Genres
## Bestsellers distribution by genre
# Pie chart of the number of bestseller entries per genre.
bestsellers %>%
  count(Genre, name = "books") %>%
  # A single empty x category stacks both genres into one bar, which
  # coord_polar() then bends into a pie.
  ggplot(aes(x = "", y = books, fill = Genre)) +
  geom_col(width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  scale_fill_manual(values = c("#ff9900", "#000000"))
Genres through the years
## Bestsellers distribution by genre by year
# Stacked columns: number of bestseller entries per year, split by genre.
bestsellers %>%
  # count() returns an ungrouped result, so no grouping leaks into the plot
  # and summarise() does not emit its regrouping message.
  count(Year, Genre, name = "sum") %>%
  ggplot(aes(x = Year, y = sum, fill = Genre)) +
  geom_col() +
  theme_minimal() +
  # One tick per year covered by the breaks below.
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  scale_fill_manual(values = c("#ff9900", "#000000")) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
Correlation Analysis
## Is there any correlation seen between the different numerical variables?
# Pairwise correlations between the four selected columns, drawn as the
# lower triangle of a colour-coded matrix.
bestsellers %>%
  select(User.Rating, Reviews, Price, Year) %>%
  cor() %>%
  corrplot::corrplot(
    type = "lower",
    order = "hclust",
    method = "color",
    addgrid.col = "darkgray",
    outline = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
    tl.cex = 1,
    tl.col = "black",
    col = brewer.pal(n = 6, name = "RdGy")
  )
```
# User Reviews vs Year
## Most reviewed books
# Take the 20 rows with the highest review counts, then collapse repeated
# Name/Reviews pairs. De-duplication happens after the cut, so fewer than
# 20 rows may be shown.
bestsellers %>%
  arrange(desc(Reviews)) %>%
  select(Name, Reviews) %>%
  slice_head(n = 20) %>%
  distinct()
```
## Is the number of reviews changing by year?
### Linear Model
# Simple linear regression of Year on Reviews; print the model summary.
bestsellers %>%
  lm(Year ~ Reviews, data = .) %>%
  summary()
```
### Data Viz
# Jittered review counts per year, coloured by genre, with one linear
# trend line per genre.
bestsellers %>%
  ggplot(aes(x = Year, y = Reviews, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```
# Reviews Distribution
# Density-scaled histogram of review counts with a kernel density overlay.
bestsellers %>%
  # after_stat(density) replaces the deprecated `..density..` notation; it
  # puts the histogram on the same y scale as geom_density().
  ggplot(aes(x = Reviews, y = after_stat(density))) +
  geom_histogram(fill = "#ff9900", color = "black", binwidth = 1000) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())
# User Ratings vs Year
## Highest Rated Books
# Take the 20 rows with the highest user rating, then collapse repeated
# Name/User.Rating pairs. De-duplication happens after the cut, so fewer
# than 20 rows may be shown.
bestsellers %>%
  arrange(desc(User.Rating)) %>%
  select(Name, User.Rating) %>%
  slice_head(n = 20) %>%
  distinct()
```
## Are users rating the bestsellers differently by year?
### Linear Model
# Linear model relating Year and User.Rating. This section is about
# ratings, so the model must use User.Rating; the previous formula
# regressed on Reviews, duplicating the model from the reviews section.
# The `Year ~ <variable>` orientation matches the other models in this file.
bestsellers %>%
  lm(Year ~ User.Rating, data = .) %>%
  summary()
```
### Data Viz
# Jittered user ratings per year, coloured by genre, with one linear
# trend line per genre.
bestsellers %>%
  ggplot(aes(x = Year, y = User.Rating, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```
# Ratings Distribution
# Density-scaled histogram of user ratings with a kernel density overlay.
bestsellers %>%
  # after_stat(density) replaces the deprecated `..density..` notation.
  ggplot(aes(x = User.Rating, y = after_stat(density))) +
  geom_histogram(fill = "#ff9900", color = "black", binwidth = 0.1) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())
# Ratings vs Reviews
# Rating against review count; point size also encodes the review count.
ggplot(bestsellers) +
  aes(
    x = Reviews,
    y = User.Rating,
    colour = Genre,
    size = Reviews
  ) +
  geom_jitter(alpha = 0.45) +
  # Colours are matched to genres by name, so the order here is irrelevant.
  scale_color_manual(
    values = c(
      Fiction = "#ff9900",
      `Non Fiction` = "#000000"
    )
  ) +
  theme_minimal()
# Price
## Is there a substantial change in price over the years?
### Linear Model
# Linear model relating Year and Price; print the model summary.
bestsellers %>%
  lm(Year ~ Price, data = .) %>%
  summary()

# Separate statement (previously fused onto the `summary()` line):
# jittered prices per year, coloured by genre, with one linear trend line
# per genre.
bestsellers %>%
  ggplot(aes(x = Year, y = Price, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```
# Price Distribution
# Density-scaled histogram of prices with a kernel density overlay.
bestsellers %>%
  # after_stat(density) replaces the deprecated `..density..` notation.
  ggplot(aes(x = Price, y = after_stat(density))) +
  geom_histogram(fill = "#ff9900", color = "black", binwidth = 1.3) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())
## Most Expensive Books
# Horizontal bar chart of the priciest entries: the 20 highest-priced rows,
# de-duplicated on Name/Price, with the maximum price highlighted.
bestsellers %>%
  select(Name, Price) %>%
  arrange(desc(Price)) %>%
  head(20) %>%
  distinct() %>%
  ggplot(aes(
    x = reorder(Name, Price),
    y = Price,
    # "red"/"grey" are only category labels; the actual colours are set in
    # scale_fill_manual() below.
    fill = ifelse(Price == max(Price), "red", "grey")
  )) +
  geom_col() +
  coord_flip() +
  # Named values make the label-to-colour mapping explicit instead of
  # relying on alphabetical level order.
  scale_fill_manual(values = c(grey = "#000000", red = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
```
# Authors
# Most Instances on the best seller list
# Ten authors with the most rows in the dataset, as a horizontal bar chart
# with the top count highlighted.
bestsellers %>%
  group_by(Author) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10) %>%
  ggplot(aes(
    x = reorder(Author, count),
    y = count,
    # "red"/"grey" are only category labels; the actual colours are set in
    # scale_fill_manual() below.
    fill = ifelse(count == max(count), "red", "grey")
  )) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  # Named values make the label-to-colour mapping explicit instead of
  # relying on alphabetical level order.
  scale_fill_manual(values = c(grey = "#000000", red = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
### What were their books?
# All rows whose Author is exactly "Jeff Kinney".
bestsellers %>%
  filter(Author == "Jeff Kinney")
# Books
## Most instances on the best sellers list
# Ten titles with the most rows in the dataset, as a horizontal bar chart
# with the top count highlighted.
bestsellers %>%
  group_by(Name) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10) %>%
  ggplot(aes(
    x = reorder(Name, count),
    y = count,
    # "red"/"grey" are only category labels; the actual colours are set in
    # scale_fill_manual() below.
    fill = ifelse(count == max(count), "red", "grey")
  )) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  # Named values make the label-to-colour mapping explicit instead of
  # relying on alphabetical level order.
  scale_fill_manual(values = c(grey = "#000000", red = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
```
# K-Means Clustering
# Clustering input: columns 3 to 6 of the first 20 rows.
# NOTE(review): columns 3:6 are presumably the numeric variables -- confirm
# against the column order of the CSV.
data_kmeans <- bestsellers %>%
  select(3:6) %>%
  slice(1:20)
```
library(factoextra)

# Clustering input: columns 3 to 6 of the first 20 rows.
# NOTE(review): columns 3:6 are presumably the numeric variables -- confirm
# against the column order of the CSV.
data_kmeans <- bestsellers[3:6] %>%
  slice(1:20)

# Centre and scale every column so all variables are on a comparable scale.
data_kmeans_scaled <- scale(data_kmeans)

# NOTE(review): this overwrites `data_kmeans` with a distance matrix that
# nothing below reads; kept to preserve the original assignments.
data_kmeans <- dist(data_kmeans_scaled)

# Elbow plot: "wss" is the total within-cluster sum of squares.
fviz_nbclust(data_kmeans_scaled, kmeans, method = "wss") +
  labs(subtitle = "Elbow Method")

# Fit k-means with 3 centres and 100 random starts. No seed is set, so
# cluster numbering may differ between runs.
kmeans_output <- kmeans(data_kmeans_scaled, centers = 3, nstart = 100)
kmeans_output.clusters <- kmeans_output$cluster

# The same 20 rows with every column, used for labelling.
bestsellers_mini <- bestsellers %>%
  slice(1:20)

# Label each point "<Name>_<row index>"; the index keeps row names unique
# when a title appears more than once.
rownames(data_kmeans_scaled) <- paste(
  bestsellers_mini$Name,
  seq_len(nrow(bestsellers_mini)),
  sep = "_"
)

# Cluster plot of the scaled data, coloured by assigned cluster.
fviz_cluster(
  list(data = data_kmeans_scaled, cluster = kmeans_output.clusters)
) +
  theme_minimal() +
  theme(
    plot.subtitle = element_text(size = 12, face = "italic"),
    plot.caption = element_text(size = 9),
    plot.title = element_text(size = 15, face = "bold")
  ) +
  labs(
    colour = "Cluster",
    fill = "Cluster",
    shape = "Cluster"
  )
```
# Cross-tabulate cluster assignment against book title, then show the
# table in long (one row per cluster/title pair) form.
test <- table(kmeans_output.clusters, bestsellers_mini[["Name"]])
test %>%
  as.data.frame()
```
# Hierarchical clustering
# Hierarchical clustering of the scaled data, cut into 4 groups.
res <- hcut(data_kmeans_scaled, k = 4, stand = TRUE)

# Visualize: dendrogram with a rectangle around each group and one colour
# per group.
fviz_dend(
  res,
  rect = TRUE,
  cex = 0.3,
  k_colors = c("#00AFBB", "#2E9FDF", "#E7B800", "#FC4E07")
) +
  theme_minimal()