Can't apply k means algorithm to my dataset Error in do_one(nmeth) : NA/NaN/Inf in foreign function call

819 views Asked by At

I am trying to estimate the optimal number of clusters with the fviz_nbclust function, but it keeps showing me this error:

Error in do_one(nmeth) : NA/NaN/Inf in foreign function call (arg 1)

In addition warning messages:

1: In stats::dist(x) : NAs introduced by coercion

2: In storage.mode(x) <- "double" : NAs introduced by coercion

I ran sum(is.na(stand_numeric_data$variable)) on every column of my dataset and it returns 0 each time, so I assume I don't have any NA values. Any tips? I am new to programming, so any suggestion would be appreciated.

# Load the movie metadata; cells containing the literal text "True" are read
# as NA.
# NOTE(review): `na.string` is not the documented argument name; it presumably
# reaches `na.strings` through partial argument matching - confirm.
movies_data <- read.csv("movies_metadata.csv", na.string = "True")

# Columns are picked by position. Going by the dput() sample of movies_data,
# 16, 17, 23, 24 and 21 are revenue, runtime, vote_average, vote_count and
# title - so the last selected column holds text, not numbers.
only_numeric <- movies_data %>% select(16, 17, 23, 24, 21) #subset of columns 

# Keep only the rows whose revenue, written as text, consists solely of digits.
only_numeric <- subset(only_numeric, grepl('^\\d+$', only_numeric$revenue))

# Same digits-only filter for runtime.
only_numeric <- subset(only_numeric, grepl('^\\d+$', only_numeric$runtime))

# Same digits-only filter for vote_average.
# NOTE(review): a decimal value such as 7.7 (seen in the dput() sample) does
# not match '^\\d+$', so this looks like it drops every non-integer average -
# confirm that is intended.
only_numeric <- subset(only_numeric, grepl('^\\d+$', only_numeric$vote_average))

# Same digits-only filter for vote_count.
only_numeric <- subset(only_numeric, grepl('^\\d+$', only_numeric$vote_count))

library(caret) #standardization

# Centre and scale. c(1:4,5) selects all five columns, title included.
preproc1 <- preProcess(only_numeric[,c(1:4,5)], method=c("center", "scale"))

# Apply the fitted transformation to the same five columns. The summary()
# output of stand_numeric_data still lists title as class character.
stand_numeric_data <- predict(preproc1, only_numeric[,c(1:4,5)])

# Counts the NAs in revenue only; a text column with no missing entries also
# yields 0 here, so this check cannot reveal a non-numeric column.
sum(is.na(stand_numeric_data$revenue))

library(factoextra) #estimate the actual number of clusters 

# Elbow (within-cluster sum of squares) plot for k-means.
# NOTE(review): stand_numeric_data still contains the character column title;
# that is the likely source of the "NAs introduced by coercion" warnings and of
# the NA/NaN/Inf error - confirm by passing only the four numeric columns.
fviz_nbclust(stand_numeric_data, kmeans, method = "wss")

Error in do_one(nmeth) : NA/NaN/Inf in foreign function call (arg 1)

In addition warning messages:

1: In stats::dist(x) : NAs introduced by coercion

2: In storage.mode(x) <- "double" : NAs introduced by coercion

dput(head(movies_data, 5))
structure(list(adult = c("False", "False", "False", "False", 
"False"), belongs_to_collection = c("{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}", 
"", "{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg'}", 
"", "{'id': 96871, 'name': 'Father of the Bride Collection', 'poster_path': '/nts4iOmNnq7GNicycMJ9pSAn204.jpg', 'backdrop_path': '/7qwE57OVZmMJChBpLEbJEmzUydk.jpg'}"
), budget = c("30000000", "65000000", "0", "16000000", "0"), 
    genres = c("[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]", 
    "[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]", 
    "[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]", 
    "[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]", 
    "[{'id': 35, 'name': 'Comedy'}]"), homepage = c("http://toystory.disney.com/toy-story", 
    "", "", "", ""), id = c("862", "8844", "15602", "31357", 
    "11862"), imdb_id = c("tt0114709", "tt0113497", "tt0113228", 
    "tt0114885", "tt0113041"), original_language = c("en", "en", 
    "en", "en", "en"), original_title = c("Toy Story", "Jumanji", 
    "Grumpier Old Men", "Waiting to Exhale", "Father of the Bride Part II"
    ), overview = c("Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.", 
    "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.", 
    "A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.", 
    "Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive \"good man\" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.", 
    "Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own."
    ), popularity = c("21.946943", "17.015539", "11.7129", "3.859495", 
    "8.387519"), poster_path = c("/rhIRbceoE9lR4veEXuwCC2wARtG.jpg", 
    "/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg", "/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg", 
    "/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg", "/e64sOI48hQXyru7naBFyssKFxVd.jpg"
    ), production_companies = c("[{'name': 'Pixar Animation Studios', 'id': 3}]", 
    "[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]", 
    "[{'name': 'Warner Bros.', 'id': 6194}, {'name': 'Lancaster Gate', 'id': 19464}]", 
    "[{'name': 'Twentieth Century Fox Film Corporation', 'id': 306}]", 
    "[{'name': 'Sandollar Productions', 'id': 5842}, {'name': 'Touchstone Pictures', 'id': 9195}]"
    ), production_countries = c("[{'iso_3166_1': 'US', 'name': 'United States of America'}]", 
    "[{'iso_3166_1': 'US', 'name': 'United States of America'}]", 
    "[{'iso_3166_1': 'US', 'name': 'United States of America'}]", 
    "[{'iso_3166_1': 'US', 'name': 'United States of America'}]", 
    "[{'iso_3166_1': 'US', 'name': 'United States of America'}]"
    ), release_date = c("1995-10-30", "1995-12-15", "1995-12-22", 
    "1995-12-22", "1995-02-10"), revenue = c(373554033, 262797249, 
    0, 81452156, 76578911), runtime = c(81, 104, 101, 127, 106
    ), spoken_languages = c("[{'iso_639_1': 'en', 'name': 'English'}]", 
    "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]", 
    "[{'iso_639_1': 'en', 'name': 'English'}]", "[{'iso_639_1': 'en', 'name': 'English'}]", 
    "[{'iso_639_1': 'en', 'name': 'English'}]"), status = c("Released", 
    "Released", "Released", "Released", "Released"), tagline = c("", 
    "Roll the dice and unleash the excitement!", "Still Yelling. Still Fighting. Still Ready for Love.", 
    "Friends are the people who let you be yourself... and never let you forget it.", 
    "Just When His World Is Back To Normal... He's In For The Surprise Of His Life!"
    ), title = c("Toy Story", "Jumanji", "Grumpier Old Men", 
    "Waiting to Exhale", "Father of the Bride Part II"), video = c("False", 
    "False", "False", "False", "False"), vote_average = c(7.7, 
    6.9, 6.5, 6.1, 5.7), vote_count = c(5415L, 2413L, 92L, 34L, 
    173L)), row.names = c(NA, 5L), class = "data.frame")
summary(stand_numeric_data)

revenue           runtime          vote_average       vote_count     
 Min.   :-0.1114   Min.   :-2.10206   Min.   :-1.5192   Min.   :-0.1414  
 1st Qu.:-0.1114   1st Qu.:-0.20831   1st Qu.:-1.5192   1st Qu.:-0.1381  
 Median :-0.1114   Median : 0.08303   Median : 0.1963   Median :-0.1381  
 Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
 3rd Qu.:-0.1114   3rd Qu.: 0.37438   3rd Qu.: 0.8825   3rd Qu.:-0.1248  
 Max.   :28.9583   Max.   :20.35581   Max.   : 1.9118   Max.   :29.3968  
    title          
 Length:11406      
 Class :character  
 Mode  :character  
2

There are 2 answers

1
UseR10085 On

I was able to reproduce your error using the iris data set as follows:

library(tidyverse)
library(factoextra)

# Inspect the column types first, then look at the summary for any NA counts.
str(iris)
summary(iris)

# Elbow plot on the measurement columns only: column 5 (Species) is left out.
fviz_nbclust(iris[, -5], kmeans, method = "wss")

# Inject missing values: every Petal.Length equal to 1.4 becomes NA.
df <- mutate(iris, Petal.Length = na_if(Petal.Length, 1.4))

# The summary now reports NAs for Petal.Length.
summary(df)

# This call reproduces the reported error.
# NOTE(review): unlike the call above, df still includes the non-numeric
# Species column, so the "NAs introduced by coercion" warnings may stem from
# that column rather than from the injected NAs - confirm.
fviz_nbclust(df, kmeans, method = "wss")

Error in do_one(nmeth) : NA/NaN/Inf in foreign function call (arg 1) In addition: Warning messages: 1: In stats::dist(x) : NAs introduced by coercion 2: In storage.mode(x) <- "double" : NAs introduced by coercion

# Keep only the rows that have no missing value in any column.
df1 <- df[complete.cases(df), ]

# The summary should no longer report any NAs.
summary(df1)

# Centre and scale the four measurement columns (1 to 4).
library(caret)

preproc1 <- preProcess(df1[, 1:4], method = c("center", "scale"))
stand_numeric_data <- predict(preproc1, newdata = df1[, 1:4])

# With complete, all-numeric input the elbow plot is produced without error.
fviz_nbclust(stand_numeric_data, kmeans, method = "wss")
0
Kay On

While I wait for a proper dataset, adapt the following to your use case to identify where the non-numeric values lie in your column, where a is the column in your data frame.

library(Hmisc)

# Example column: mixing NA, numbers and text yields a character vector.
a <- c(NA, NA, 2, 3, "aa")

# Check each element on its own; in the output shown below, FALSE marks the NA
# entries and the non-numeric text.
sapply(a, all.is.numeric)

Output:

 <NA>  <NA>     2     3    aa 
FALSE FALSE  TRUE  TRUE FALSE 

You can read about the all.is.numeric function here: http://math.furman.edu/~dcs/courses/math47/R/library/Hmisc/html/all.is.numeric.html