package scala

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object wordCountTest {
  def main(args: Array[String]): Unit = {
    // SparkSession is the standard entry point since Spark 3.0
    val spark = SparkSession.builder()
      .master("local[*]") // comment out .master(...) before packaging for cluster submission
      .appName("wordCount")
      .getOrCreate()
    import spark.implicits._
    val sc = spark.sparkContext

    val filename = if (args.length > 0) args(0) else "./data/graphx/users.txt"

    // read the input file
    val line: RDD[String] = sc.textFile(filename)

    // split each line on commas, then on spaces
    val result1: RDD[String] = line.flatMap(_.split(",")).flatMap(_.split(" "))

    // map each word to a (word, 1) pair
    val rlt2: RDD[(String, Int)] = result1.map(word => (word, 1))

    // sum the counts per word by reducing the (word, 1) tuples by key
    val rsl3: RDD[(String, Int)] = rlt2.reduceByKey(_ + _)

    // filter the counts (here: keep words that occur at least once)
    val rsl4: RDD[(String, Int)] = rsl3.filter(_._2 > 0)

    // convert the RDD to a DataFrame for easier display
    val frame: DataFrame = rsl4.toDF("key", "val")
    // rsl4.foreach(println)

    // show the result
    frame.show()
    spark.stop()
  }
}
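For comparison, the same pipeline can be expressed with the Dataset/DataFrame API instead of raw RDD transformations. The sketch below is not part of the original program; the object name wordCountDfSketch and the combined split pattern "[ ,]+" are assumptions, but the result keeps the same (key, val) schema as the RDD version above.

package scala

import org.apache.spark.sql.SparkSession

object wordCountDfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]") // comment out before packaging, as in the RDD version
      .appName("wordCountDF")
      .getOrCreate()
    import spark.implicits._

    val path = if (args.length > 0) args(0) else "./data/graphx/users.txt"

    // spark.read.textFile yields a Dataset[String]; its single column is named "value"
    val counts = spark.read.textFile(path)
      .flatMap(_.split("[ ,]+"))   // split on commas and spaces in one pass
      .filter(_.nonEmpty)          // drop empty tokens
      .groupBy("value").count()    // count occurrences per word
      .withColumnRenamed("value", "key")
      .withColumnRenamed("count", "val")

    counts.show()
    spark.stop()
  }
}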
Submission command:
spark-submit --class scala.wordCountTest --master yarn --queue root.default --deploy-mode client D:\hadoop\workCode\sparkhiveproject\target\sparkhiveproject-1.0-SNAPSHOT.jar <input file path>
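Note that --deploy-mode client keeps the driver on the submitting machine. When the driver should run inside the YARN cluster, the jar and input typically live on HDFS and --deploy-mode cluster is used instead. A hypothetical cluster-mode variant might look like this (the HDFS paths and resource sizes are placeholders, not taken from the original):

spark-submit --class scala.wordCountTest --master yarn --deploy-mode cluster --queue root.default --num-executors 2 --executor-memory 1g hdfs:///path/to/sparkhiveproject-1.0-SNAPSHOT.jar hdfs:///path/to/input.txt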