A simple guide to K-Means in R

Karat Sidhu

6 min read · May 6, 2022

Using Kaggle.com Amazon Bestsellers Dataset

Analysis of the Amazon bestsellers and their statistics using various R packages.

For this project I used Quarto instead of Rmarkdown.

Notebook available at:

Amazon Bestsellers

Loading libraries

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(skimr))
suppressPackageStartupMessages(library(corrplot))
suppressPackageStartupMessages(library(RColorBrewer))

Loading Data

# Read the Kaggle export (expected in the working directory), then peek at
# the first two rows and the column types.
bestsellers <- read.csv("bestsellers with categories.csv")
head(bestsellers, 2)
str(bestsellers)
```

Exploring the Data

Column Names

# List the column names of the data set.
colnames(bestsellers)

Checking for missing Values

# TRUE if at least one cell in the data frame is NA.
anyNA(bestsellers)

Data at a glance

# Distinct years and genres present in the data.
unique(bestsellers[["Year"]])
unique(bestsellers[["Genre"]])

Data Distribution

# Per-column summary statistics from skimr.
bestsellers %>%
  skim()

## all data formats look good, no conversion needed for now

Data Visualization

# Genres

## Bestsellers distribution by genre

# Share of bestseller entries per genre, drawn as a pie chart: a single
# stacked bar (x = "") wrapped into polar coordinates along y.
bestsellers %>%
  count(Genre, name = "books") %>%
  ggplot(aes(x = "", y = books, fill = Genre)) +
  geom_col(width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  scale_fill_manual(values = c("#ff9900", "#000000"))

Genres through the years

## Bestsellers distribution by genre by year

# Stacked columns: number of bestseller entries per year, split by genre.
# count() returns an ungrouped result, so no grouping leaks into the plot.
bestsellers %>%
  count(Year, Genre, name = "sum") %>%
  ggplot(aes(x = Year, y = sum, fill = Genre)) +
  geom_col() +
  theme_minimal() +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  scale_fill_manual(values = c("#ff9900", "#000000")) +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )

Correlation Analysis

## Is there any correlation seen between the different numerical variables?

# Pairwise correlations between the four numeric columns, shown as the
# lower triangle of a colour-coded matrix ordered by hierarchical clustering.
bestsellers %>%
  select(User.Rating, Reviews, Price, Year) %>%
  cor() %>%
  corrplot::corrplot(
    type = "lower",
    order = "hclust",
    method = "color",
    addgrid.col = "darkgray",
    outline = TRUE,
    tl.cex = 1,
    tl.col = "black",
    col = brewer.pal(n = 6, name = "RdGy")
  )
```

# User Reviews vs Year

## Most reviewed books

# Top 20 books by review count.
# A title can occupy several rows, so de-duplicate BEFORE taking the top 20;
# otherwise repeated rows use up slots and fewer than 20 books come back.
bestsellers %>%
  select(Name, Reviews) %>%
  distinct() %>%
  arrange(desc(Reviews)) %>%
  head(20)
```

## Are the number of reviews changing by year?

### Linear Model

# Does the review count change with the year? Reviews is the response and
# Year the predictor, matching the Year-vs-Reviews plot of this section.
lm(Reviews ~ Year, data = bestsellers) %>%
  summary()
```

### Data Viz

# Reviews per year, jittered, with one linear trend line per genre.
bestsellers %>%
  ggplot(aes(Year, Reviews, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```

# Reviews Distribution

# Density-scaled histogram of review counts (bins of 1000 reviews) with a
# kernel density estimate overlaid. after_stat(density) replaces the
# deprecated ..density.. notation.
bestsellers %>%
  ggplot(aes(Reviews, after_stat(density))) +
  geom_histogram(fill = "#ff9900", color = "black", binwidth = 1000) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

# User Ratings vs Year

## Highest Rated Books

# Top 20 books by user rating.
# A title can occupy several rows, so de-duplicate BEFORE taking the top 20;
# otherwise repeated rows use up slots and fewer than 20 books come back.
bestsellers %>%
  select(Name, User.Rating) %>%
  distinct() %>%
  arrange(desc(User.Rating)) %>%
  head(20)
```

## Are users rating the bestsellers differently by year?

### Linear Model

# Are ratings changing by year? Model User.Rating as a function of Year.
# (The previous formula, Year ~ Reviews, was copied from the reviews section
# and did not involve ratings at all.)
lm(User.Rating ~ Year, data = bestsellers) %>%
  summary()
```

### Data Viz

# User rating per year, jittered, with one linear trend line per genre.
bestsellers %>%
  ggplot(aes(Year, User.Rating, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```

# Ratings Distribution

# Density-scaled histogram of user ratings (bins of 0.1) with a kernel
# density estimate overlaid. after_stat(density) replaces the deprecated
# ..density.. notation.
bestsellers %>%
  ggplot(aes(User.Rating, after_stat(density))) +
  geom_histogram(fill = "#ff9900", color = "black", binwidth = 0.1) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

# Ratings vs Reviews

# Rating against review count; point size also encodes the review count and
# colour encodes the genre (named so each genre gets a fixed colour).
ggplot(bestsellers) +
  aes(
    x = Reviews,
    y = User.Rating,
    colour = Genre,
    size = Reviews
  ) +
  geom_jitter(alpha = 0.45) +
  scale_color_manual(
    values = c(
      Fiction = "#ff9900",
      `Non Fiction` = "#000000"
    )
  ) +
  theme_minimal()

# Price

## Is there a substantial change in price over the years?

### Linear Model

# Does the price change with the year? Price is the response and Year the
# predictor, matching the plot below.
lm(Price ~ Year, data = bestsellers) %>%
  summary()

# Price per year, jittered, with one linear trend line per genre.
# (This plot was previously fused onto the summary() line, which is a
# syntax error.)
bestsellers %>%
  ggplot(aes(Year, Price, color = Genre)) +
  geom_jitter() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_minimal() +
  scale_color_manual(values = c("#ff9900", "#000000")) +
  scale_x_continuous(breaks = seq(2009, 2019, by = 1)) +
  theme(
    panel.grid.minor = element_blank(),
    legend.title = element_blank(),
    axis.title.x = element_blank()
  )
```

# Price Distribution

# Density-scaled histogram of prices (bins of 1.3) with a kernel density
# estimate overlaid. after_stat(density) replaces the deprecated
# ..density.. notation.
bestsellers %>%
  ggplot(aes(Price, after_stat(density))) +
  geom_histogram(fill = "#ff9900", color = "black", binwidth = 1.3) +
  geom_density(alpha = 0.5, fill = "#ff9900") +
  theme_minimal() +
  theme(panel.grid.major = element_blank())

## Most Expensive Books

# Horizontal bar chart of the 20 most expensive (Name, Price) pairs, with the
# single most expensive one highlighted.
# De-duplicate BEFORE taking the top 20 so repeated rows of the same title
# do not use up slots.
bestsellers %>%
  select(Name, Price) %>%
  distinct() %>%
  arrange(desc(Price)) %>%
  head(20) %>%
  ggplot(aes(
    x = reorder(Name, Price),
    y = Price,
    # "red"/"grey" are only keys; the real colours are set in the scale below.
    fill = ifelse(Price == max(Price), "red", "grey")
  )) +
  geom_col() +
  coord_flip() +
  # Named values make the key -> colour mapping explicit instead of relying
  # on the alphabetical order of the keys.
  scale_fill_manual(values = c(grey = "#000000", red = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
```

# Authors

# Most Instances on the best seller list

# Ten authors with the most rows in the data set, as a horizontal bar chart
# with the top author(s) highlighted.
bestsellers %>%
  count(Author, name = "count") %>%
  arrange(desc(count)) %>%
  head(10) %>%
  ggplot(aes(
    x = reorder(Author, count),
    y = count,
    # "red"/"grey" are only keys; the real colours are set in the scale below.
    fill = ifelse(count == max(count), "red", "grey")
  )) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  # Named values make the key -> colour mapping explicit instead of relying
  # on the alphabetical order of the keys.
  scale_fill_manual(values = c(grey = "#000000", red = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )

### What were their books?

# All rows for one author.
bestsellers %>%
  filter(Author == "Jeff Kinney")

# Books

## Most instances on the best sellers list

# Ten titles with the most rows in the data set, as a horizontal bar chart
# with the top title(s) highlighted.
bestsellers %>%
  count(Name, name = "count") %>%
  arrange(desc(count)) %>%
  head(10) %>%
  ggplot(aes(
    x = reorder(Name, count),
    y = count,
    # "red"/"grey" are only keys; the real colours are set in the scale below.
    fill = ifelse(count == max(count), "red", "grey")
  )) +
  scale_y_continuous(breaks = seq(0, 13, by = 1)) +
  geom_col() +
  coord_flip() +
  # Named values make the key -> colour mapping explicit instead of relying
  # on the alphabetical order of the keys.
  scale_fill_manual(values = c(grey = "#000000", red = "#ff9900")) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none",
    axis.title.y = element_blank()
  )
```

# K-Means Clustering

# Clustering input: columns 3 to 6 of the first 20 rows.
# NOTE(review): presumably the four numeric columns - confirm column order.
data_kmeans <- bestsellers[3:6] %>%
  slice(1:20)

```

library(factoextra)

# Clustering input: columns 3 to 6 of the first 20 rows.
# NOTE(review): presumably the four numeric columns - confirm column order.
data_kmeans <- bestsellers[3:6] %>%
  slice(1:20)

# Centre and scale every column before clustering.
data_kmeans_scaled <- scale(data_kmeans)

# Elbow plot: total within-cluster sum of squares for each candidate k.
fviz_nbclust(data_kmeans_scaled, kmeans, method = "wss") +
  labs(subtitle = "Elbow Method")

# Fit k-means with 3 centres, keeping the best of 100 random starts.
# Results depend on the RNG; call set.seed() beforehand for reproducible
# cluster assignments.
kmeans_output <- kmeans(data_kmeans_scaled, centers = 3, nstart = 100)
kmeans_output.clusters <- kmeans_output$cluster

# The same 20 rows with all columns, used to label the points by title.
bestsellers_mini <- bestsellers %>%
  slice(1:20)

# Row names must be unique, so append the row index to each title.
rownames(data_kmeans_scaled) <- paste(
  bestsellers_mini$Name,
  seq_len(nrow(bestsellers_mini)),
  sep = "_"
)

# Cluster plot of the scaled data, coloured by k-means assignment.
fviz_cluster(
  list(data = data_kmeans_scaled, cluster = kmeans_output.clusters)
) +
  theme_minimal() +
  theme(
    plot.subtitle = element_text(size = 12, face = "italic"),
    plot.caption = element_text(size = 9),
    plot.title = element_text(size = 15, face = "bold")
  ) +
  labs(
    colour = "Cluster",
    fill = "Cluster",
    shape = "Cluster"
  )
```

# Cross-tabulate cluster assignment against book title and show it in long
# (data frame) form.
test <- table(kmeans_output.clusters, bestsellers_mini$Name)
test %>%
  as.data.frame()
```

# Hierarchical clustering

# Hierarchical clustering of the scaled data, cut into 4 groups.
res <- hcut(data_kmeans_scaled, k = 4, stand = TRUE)

# Dendrogram with one colour and one bounding rectangle per cluster.
fviz_dend(
  res,
  rect = TRUE,
  cex = 0.3,
  k_colors = c("#00AFBB", "#2E9FDF", "#E7B800", "#FC4E07")
) +
  theme_minimal()

A simple guide to K-Means in R

Amazon Bestsellers

Loading libraries

Loading Data

Exploring the Data

Data Visualization

Genres through the years

Correlation Analysis

# User Reviews vs Year

## Are the number of reviews changing by year?

# Reviews Distribution

# User Ratings vs Year

## Are users rating the bestsellers differently by year?

# Ratings Distribution

# Ratings vs Reviews

# Price

# Price Distribution

## Most Expensive Books

# Authors

# Most Instances on the best seller list

### What were their books?

# Books

# K — Means Clustering

# Hierarchical clustering

Written by Karat Sidhu

No responses yet