A simple guide to K-Means in R

Karat Sidhu
6 min read · May 6, 2022

--

Using Kaggle.com Amazon Bestsellers Dataset

Photo by Thought Catalog on Unsplash

Analysis of the Amazon bestsellers and their statistics using various R packages.

For this project I used Quarto instead of Rmarkdown.

Amazon Bestsellers

Loading libraries

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(skimr))
suppressPackageStartupMessages(library(corrplot))
suppressPackageStartupMessages(library(RColorBrewer))

Loading Data

# Read the Kaggle Amazon bestsellers CSV from the working directory, then
# preview the first two rows and the column types.
bestsellers <- read.csv("bestsellers with categories.csv")
head(bestsellers, 2)
str(bestsellers)
```

Exploring the Data

Column Names

# List the column names of the data set.
colnames(bestsellers)

Checking for missing Values

# TRUE if at least one cell anywhere in the data frame is NA.
bestsellers %>%
  is.na() %>%
  any()

Data at a glance

# Distinct years and genres present in the data.
unique(bestsellers[["Year"]])
unique(bestsellers[["Genre"]])

Data Distribution

# Per-column summary statistics of the whole data frame.
skimr::skim(bestsellers)

## all data formats look good, no conversion needed for now

Data Visualization

# Genres

## Bestsellers distribution by genre

# Pie chart of the number of bestsellers per genre: a single stacked column
# drawn in polar coordinates.
bestsellers %>%
  count(Genre, name = "books") %>%
  ggplot(aes(x = "", y = books, fill = Genre)) +
  geom_col(width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  scale_fill_manual(values = c("#ff9900", "#000000"))

Genres through the years

## Bestsellers distribution by genre by year

# Stacked columns: number of bestsellers per genre for each year.
# count() returns an ungrouped result, so no leftover grouping (or the
# summarise() regrouping message) leaks into the plot pipeline.
bestsellers %>%
  count(Year, Genre, name = "books") %>%
  ggplot(aes(x = Year, y = books, fill = Genre)) +
  geom_col() +
  theme_minimal() +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  scale_fill_manual(values = c("#ff9900", "#000000")) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )

Correlation Analysis

## Is there any correlation seen between the different numerical variables?

# Correlation matrix of the four numeric columns, drawn as a lower-triangle
# colour grid with variables ordered by hierarchical clustering.
bestsellers %>%
  select(User.Rating, Reviews, Price, Year) %>%
  cor() %>%
  corrplot::corrplot(
    type = "lower",
    order = "hclust",
    method = "color",
    addgrid.col = "darkgray",
    outline = TRUE, # spelled out: `T` is an ordinary, reassignable variable
    tl.cex = 1,
    tl.col = "black",
    col = brewer.pal(n = 6, name = "RdGy")
  )
```

# User Reviews vs Year

## Most reviewed books

# Top 20 most reviewed books.
# De-duplicate BEFORE taking the top 20: a title can appear in several years,
# so truncating first and de-duplicating afterwards returns fewer than 20
# distinct books.
bestsellers %>%
  distinct(Name, Reviews) %>%
  arrange(desc(Reviews)) %>%
  head(20)
```

## Is the number of reviews changing by year?

### Linear Model

# Does the number of reviews change with the year?
# Reviews is the response and Year the predictor, matching the question asked
# and the Year (x) vs Reviews (y) plot; the formula was previously reversed.
lm(Reviews ~ Year, data = bestsellers) %>%
  summary()
```

### Data Viz

# Jittered scatter of review counts per year, coloured by genre, with one
# linear trend line per genre.
bestsellers %>%
  ggplot(aes(Year, Reviews, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```

# Reviews Distribution

# Density-scaled histogram of review counts (bins of 1000) with a kernel
# density overlay. after_stat(density) replaces the deprecated `..density..`.
bestsellers %>%
  ggplot(aes(Reviews, after_stat(density))) +
  geom_histogram(fill = "#ff9900", color = "black", binwidth = 1000) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

# User Ratings vs Year

## Highest Rated Books

# Top 20 highest rated books.
# De-duplicate BEFORE taking the top 20: a title can appear in several years,
# so truncating first and de-duplicating afterwards returns fewer than 20
# distinct books.
bestsellers %>%
  distinct(Name, User.Rating) %>%
  arrange(desc(User.Rating)) %>%
  head(20)
```

## Are users rating the bestsellers differently by year?

### Linear Model

# Do user ratings change with the year?
# This section is about ratings, so the response is User.Rating; the previous
# formula modelled Year against Reviews, a copy of the reviews section.
lm(User.Rating ~ Year, data = bestsellers) %>%
  summary()
```

### Data Viz

# Jittered scatter of user ratings per year, coloured by genre, with one
# linear trend line per genre.
bestsellers %>%
  ggplot(aes(Year, User.Rating, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```

# Ratings Distribution

# Density-scaled histogram of user ratings (bins of 0.1) with a kernel
# density overlay. after_stat(density) replaces the deprecated `..density..`.
bestsellers %>%
  ggplot(aes(User.Rating, after_stat(density))) +
  geom_histogram(
    fill = "#ff9900",
    color = "black",
    binwidth = 0.1
  ) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

# Ratings vs Reviews

# Rating against review count; point size also encodes the review count and
# colour encodes the genre (colours assigned by genre name).
ggplot(bestsellers) +
  aes(
    x = Reviews,
    y = User.Rating,
    colour = Genre,
    size = Reviews
  ) +
  geom_jitter(alpha = 0.45) +
  scale_color_manual(
    values = c(
      Fiction = "#ff9900",
      `Non Fiction` = "#000000"
    )
  ) +
  theme_minimal()

# Price

## Is there a substantial change in price over the years?

### Linear Model

# Does the price change over the years?
# Price is the response and Year the predictor, matching the question asked
# and the Year (x) vs Price (y) plot; the formula was previously reversed.
lm(Price ~ Year, data = bestsellers) %>%
  summary()
# Jittered scatter of prices per year, coloured by genre, with one linear
# trend line per genre.
bestsellers %>%
  ggplot(aes(Year, Price, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```

# Price Distribution

# Density-scaled histogram of prices (bins of 1.3) with a kernel density
# overlay. after_stat(density) replaces the deprecated `..density..`.
bestsellers %>%
  ggplot(aes(Price, after_stat(density))) +
  geom_histogram(
    fill = "#ff9900",
    color = "black",
    binwidth = 1.3
  ) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

## Most Expensive Books

# Horizontal bar chart of the 20 most expensive books; the most expensive one
# is highlighted in orange, the rest are black.
# De-duplicate BEFORE taking the top 20 so repeated yearly entries of the same
# title do not shrink the list.
bestsellers %>%
  distinct(Name, Price) %>%
  arrange(desc(Price)) %>%
  head(20) %>%
  ggplot(aes(
    x = reorder(Name, Price),
    y = Price,
    fill = ifelse(Price == max(Price), "top", "rest")
  )) +
  geom_col() +
  coord_flip() +
  # Named values make the label-to-colour mapping explicit instead of relying
  # on alphabetical ordering of the labels.
  scale_fill_manual(values = c(rest = "#000000", top = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
```

# Authors

# Most Instances on the best seller list

# Horizontal bar chart of the 10 authors with the most bestseller-list rows;
# the author(s) with the highest count are highlighted in orange.
bestsellers %>%
  count(Author, name = "count") %>%
  arrange(desc(count)) %>%
  head(10) %>%
  ggplot(aes(
    x = reorder(Author, count),
    y = count,
    fill = ifelse(count == max(count), "top", "rest")
  )) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  # Named values make the label-to-colour mapping explicit instead of relying
  # on alphabetical ordering of the labels.
  scale_fill_manual(values = c(rest = "#000000", top = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )

### What were their books?

# All bestseller-list rows for one author.
bestsellers %>%
  filter(Author == "Jeff Kinney")

# Books

## Most instances on the best sellers list

# Horizontal bar chart of the 10 titles with the most bestseller-list rows;
# the title(s) with the highest count are highlighted in orange.
bestsellers %>%
  count(Name, name = "count") %>%
  arrange(desc(count)) %>%
  head(10) %>%
  ggplot(aes(
    x = reorder(Name, count),
    y = count,
    fill = ifelse(count == max(count), "top", "rest")
  )) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  # Named values make the label-to-colour mapping explicit instead of relying
  # on alphabetical ordering of the labels.
  scale_fill_manual(values = c(rest = "#000000", top = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
```

# K-Means Clustering

# Clustering input: columns 3 to 6 of the first 20 rows.
data_kmeans <- bestsellers[3:6] %>%
  slice(1:20)

```

library(factoextra)

# Clustering input: columns 3 to 6 of the first 20 rows, standardised
# column-wise so that no single variable dominates the distance.
data_kmeans <- bestsellers[3:6] %>%
  slice(1:20)
data_kmeans_scaled <- scale(data_kmeans)

# Pairwise distances between the scaled rows. NOTE(review): nothing visible
# in this file reads `data_kmeans` after this point; kept for compatibility.
data_kmeans <- dist(data_kmeans_scaled)

# k-means uses random starting centres; fix the seed so the elbow plot and the
# cluster labels are reproducible between runs.
set.seed(123)

# Elbow plot: total within-cluster sum of squares ("wss") for a range of k.
fviz_nbclust(data_kmeans_scaled, kmeans, method = "wss") +
  labs(subtitle = "Elbow Method")

# Fit 3 clusters, keeping the best of 100 random starts.
kmeans_output <- kmeans(data_kmeans_scaled, centers = 3, nstart = 100)
kmeans_output.clusters <- kmeans_output$cluster

# The same 20 rows with all original columns, used for labelling.
bestsellers_mini <- bestsellers %>%
  slice(1:20)
```
# Label each scaled row as "<book name>_<row index>"; the index keeps row
# names unique when a title occurs more than once.
rownames(data_kmeans_scaled) <- paste(
  bestsellers_mini$Name,
  seq_len(nrow(bestsellers_mini)),
  sep = "_"
)

# Plot the observations coloured by their k-means cluster assignment.
fviz_cluster(
  list(data = data_kmeans_scaled, cluster = kmeans_output.clusters)
) +
  theme_minimal() +
  theme(
    plot.subtitle = element_text(size = 12, face = "italic"),
    plot.caption = element_text(size = 9),
    plot.title = element_text(size = 15, face = "bold")
  ) +
  labs(
    colour = "Cluster",
    fill = "Cluster",
    shape = "Cluster"
  )
```
# Cross-tabulate cluster assignment against book name and show the result in
# long (one row per cluster/name pair) form.
test <- table(kmeans_output.clusters, bestsellers_mini$Name)
test %>%
  as.data.frame()
```

# Hierarchical clustering

# Hierarchical clustering of the scaled data, cut into 4 groups.
res <- hcut(data_kmeans_scaled, k = 4, stand = TRUE)

# Dendrogram with a rectangle around each group and one colour per group.
fviz_dend(
  res,
  rect = TRUE,
  cex = 0.3,
  k_colors = c("#00AFBB", "#2E9FDF", "#E7B800", "#FC4E07")
) +
  theme_minimal()

--

--

No responses yet