Display the contents of clusters after clustering in the StreamingKMeansExample.scala source in Spark


I want to run the StreamingKMeansExample.scala source code (MLlib) on Spark. Can someone tell me how to display the contents of the clusters after clustering? For example, if I cluster the data into 3 clusters, how can I write the contents of the 3 clusters to 3 files, and the cluster centers to a file.txt?

    package org.apache.spark.examples.mllib

    import org.apache.spark.SparkConf
    import org.apache.spark.mllib.clustering.StreamingKMeans
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    object StreamingKMeansExample {

      def main(args: Array[String]) {
        if (args.length != 5) {
          System.err.println("Usage: StreamingKMeansExample " +
            "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
          System.exit(1)
        }

        // "localhost" is not a valid master URL; a local streaming app needs
        // at least two threads, hence "local[2]"
        val conf = new SparkConf().setMaster("local[2]").setAppName("StreamingKMeansExample")
        val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

        val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
        val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)

        val model = new StreamingKMeans()
          .setK(args(3).toInt)
          .setDecayFactor(1.0)
          .setRandomCenters(args(4).toInt, 0.0)

        model.trainOn(trainingData)
        model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

        ssc.start()
        ssc.awaitTermination()
      }
    }

1 Answer

Anant

You would have to use the predict method on your RDD (look here for reference). Then you can zip the RDD containing your values with the RDD of the predicted clusters they fall in.
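
Concretely, that can be done inside foreachRDD on the training stream: take a snapshot of the current model with latestModel(), predict a cluster id for every point, zip the points with the ids, and save each cluster separately. The following is a minimal sketch, not the definitive approach; the output paths (clusters/cluster-0, centers-<time>.txt) and the hard-coded k = 3 are assumptions for illustration.

    // Sketch only: assumes k = 3 and hypothetical output paths.
    // Place this after model.trainOn(trainingData) and before ssc.start().
    trainingData.foreachRDD { (rdd, time) =>
      if (!rdd.isEmpty()) {
        val latest = model.latestModel()        // snapshot of the current model
        val predictions = latest.predict(rdd)   // RDD[Int]: one cluster id per point
        val assigned = rdd.zip(predictions)     // RDD[(Vector, Int)]

        // One output directory per cluster. saveAsTextFile will not overwrite,
        // so the batch time is appended to keep every path unique.
        for (k <- 0 until 3) {
          assigned.filter(_._2 == k)
                  .keys
                  .saveAsTextFile(s"clusters/cluster-$k-${time.milliseconds}")
        }

        // Write the current centers to a plain text file on the driver.
        val out = new java.io.PrintWriter(s"centers-${time.milliseconds}.txt")
        try latest.clusterCenters.foreach(c => out.println(c.toString))
        finally out.close()
      }
    }

If you only need the centers on the console rather than in a file, latest.clusterCenters.foreach(println) inside the same foreachRDD is enough; writing one directory per cluster is what lets you inspect the 3 clusters as 3 separate outputs.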