I want to run the streaming k-means example (StreamingKMeansExample.scala from MLlib) on Spark. Can someone tell me how I can display the content of the clusters after clustering? For example, if I cluster the data into 3 clusters, how can I write the content of the 3 clusters to 3 files and the cluster centers to a file such as file.txt?
package org.apache.spark.examples.mllib
import org.apache.spark.SparkConf
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Streaming k-means example.
 *
 * Trains a [[StreamingKMeans]] model on vectors read from text files appearing in
 * `trainingDir`, and prints `(label, predictedCluster)` pairs for the labeled points
 * read from text files appearing in `testDir`.
 *
 * Usage:
 * {{{
 *   StreamingKMeansExample <trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>
 * }}}
 *
 *  - `trainingDir`   directory streamed for training data; each line is parsed with `Vectors.parse`
 *  - `testDir`       directory streamed for test data; each line is parsed with `LabeledPoint.parse`
 *  - `batchDuration` streaming batch interval, in seconds
 *  - `numClusters`   number of clusters (k)
 *  - `numDimensions` dimensionality of the input vectors, used to create the random initial centers
 */
object StreamingKMeansExample {

  /**
   * Entry point. Exits with status 1 and prints the usage string when the number of
   * arguments is not exactly five; otherwise starts the streaming job and blocks until
   * the streaming context terminates.
   *
   * @param args command-line arguments, see the usage string on the enclosing object
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 5) {
      System.err.println(
        "Usage: StreamingKMeansExample " +
          "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
      System.exit(1)
    }

    // Safe to destructure: the arity was validated above.
    val Array(trainingDir, testDir, batchDuration, numClusters, numDimensions) = args

    // The argument list must stay on the same line as `setAppName`; with a line break
    // before the parenthesis Scala ends the statement early and the method is never applied.
    // "localhost" is not a valid master URL. Default to local mode with two threads, but
    // only when no master was supplied externally (e.g. via `spark-submit --master ...`).
    val conf = new SparkConf()
      .setAppName("StreamingKMeansExample")
      .setIfMissing("spark.master", "local[2]")
    val ssc = new StreamingContext(conf, Seconds(batchDuration.toLong))

    val trainingData = ssc.textFileStream(trainingDir).map(Vectors.parse)
    val testData = ssc.textFileStream(testDir).map(LabeledPoint.parse)

    val model = new StreamingKMeans()
      .setK(numClusters.toInt)
      .setDecayFactor(1.0)
      .setRandomCenters(numDimensions.toInt, 0.0)

    // Update the model on every training batch and print, for every test batch,
    // the label of each point together with the cluster it is assigned to.
    model.trainOn(trainingData)
    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
You would have to use the predict method on your RDD (look here for reference). Then you could zip the RDD containing your values with the RDD of predicted clusters they fall into.