南宁市网站建设_网站建设公司_动画效果_seo优化
2026/1/1 11:08:55 网站建设 项目流程

前言

由于各个软件版本不同,本篇文章仅供参考。如果你在大数据考试时恰好看到了这篇文章,可以参考其中的步骤;当然,采用其他技术实现、沿用其中的思路也是可以的,比如用 Python,甚至可以不用 Hadoop。

实验要求

Result文件数据说明:

Ip:106.39.41.166,(城市)

Date:10/Nov/2016:00:01:02 +0800,(日期)

Day:10,(天数)

Traffic: 54 ,(流量)

Type: video,(类型:视频video或文章article)

Id: 8701(视频或者文章的id)

测试要求:

  • 数据清洗:按照进行数据清洗,并将清洗后的数据导入hive数据库中。

两阶段数据清洗:

(1)第一阶段:把需要的信息从原始日志中提取出来

ip:    199.30.25.88

time:  10/Nov/2016:00:01:03 +0800

traffic:  62

文章: article/11325

视频: video/3235

(2)第二阶段:根据提取出来的信息做精细化操作

ip--->城市 city(IP)

date--> time:2016-11-10 00:01:03

day: 10

traffic:62

type:article/video

id:11325

(3)hive数据库表结构:

create table data(  ip string,  time string , day string, traffic bigint,

type string, id   string )

2、数据分析:在 Hive 中统计下列数据。

(1)统计最受欢迎的视频/文章的Top10访问次数 (video/article)

(2)按照地市统计最受欢迎的Top10课程 (ip)

(3)按照流量统计最受欢迎的Top10课程 (traffic)

3、数据可视化:

将统计结果导入MySQL数据库中,通过图形化展示的方式展现出来。

 

第一阶段

这里使用到了ip2region模块,需要下载ip2region_v4.xdb。这个模块的使用可以参考我另一篇博客:数据分析中 使用ip2region来进行 ip地址转换为城市 - 雨花阁 - 博客园

image

导入依赖:

<dependency><groupId>org.lionsoul</groupId><artifactId>ip2region</artifactId><version>2.6.5</version>
</dependency>

MR程序:

DataCleaner
package o;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.lionsoul.ip2region.xdb.Searcher;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

/**
 * Two-stage log cleaner.
 *
 * Stage 1 (mapper): parse raw CSV access-log lines and re-emit the fields,
 * keyed by IP. Stage 2 (reducer): resolve IP to a city via ip2region,
 * normalize the timestamp, sanitize traffic, and write the final CSV row
 * that the Hive table consumes.
 *
 * Input line format:  ip,date,day,traffic,type,id
 * Output line format: city(ip),yyyy-MM-dd HH:mm:ss,day,traffic,type,id
 */
public class DataCleaner {

    /** Stage 1: extract the six CSV fields and forward them keyed by IP. */
    public static class FirstStageMapper extends Mapper<LongWritable, Text, Text, Text> {
        private final Text outputKey = new Text();
        private final Text outputValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString().trim();
            if (line.isEmpty()) {
                return;
            }
            // CSV: ip,date,day,traffic,type,id
            // e.g. 106.39.41.166,10/Nov/2016:00:01:02 +0800,10,54 ,video,8701
            // The date field contains no commas, so a plain split is safe.
            String[] parts = line.split(",");
            if (parts.length >= 6) {
                try {
                    String ip = parts[0].trim();
                    String dateTime = parts[1].trim();
                    // parts[2] (the "day" column) is intentionally dropped here;
                    // the reducer re-derives the day from the parsed timestamp.
                    String traffic = parts[3].trim();
                    String type = parts[4].trim();
                    String id = parts[5].trim();
                    // Intermediate format: ip|dateTime|traffic|type|id
                    outputKey.set(ip);
                    outputValue.set(ip + "|" + dateTime + "|" + traffic + "|" + type + "|" + id);
                    context.write(outputKey, outputValue);
                } catch (Exception e) {
                    System.err.println("Error parsing line: " + line);
                    e.printStackTrace();
                }
            } else {
                System.err.println("Invalid line format: " + line
                        + " (expected 6 parts, got " + parts.length + ")");
            }
        }
    }

    /** Stage 2: city lookup, date normalization, traffic sanitizing. */
    public static class SecondStageReducer extends Reducer<Text, Text, Text, Text> {
        private Searcher searcher;

        // BUG FIX: the log timestamps use English month abbreviations ("Nov").
        // Without an explicit Locale, SimpleDateFormat uses the JVM default
        // locale, so on a non-English system (e.g. zh_CN) parse() throws for
        // every record and the whole dataset is silently discarded.
        private final SimpleDateFormat inputFormat =
                new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.ENGLISH);
        private final SimpleDateFormat outputFormat =
                new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
        // Hoisted out of reduce(): the original allocated one per record.
        private final SimpleDateFormat dayFormat =
                new SimpleDateFormat("dd", Locale.ENGLISH);

        @Override
        protected void setup(Context context) throws IOException {
            // Load the ip2region database from the classpath into memory once
            // per reducer; Searcher.newWithBuffer avoids per-query disk I/O.
            try (InputStream in = getClass().getClassLoader()
                    .getResourceAsStream("ip2region_v4.xdb");
                 ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
                if (in == null) {
                    throw new IOException("ip2region_v4.xdb not found in classpath");
                }
                byte[] buffer = new byte[4096];
                int bytesRead;
                while ((bytesRead = in.read(buffer)) != -1) {
                    baos.write(buffer, 0, bytesRead);
                }
                searcher = Searcher.newWithBuffer(baos.toByteArray());
            } catch (Exception e) {
                System.err.println("Failed to initialize ip2region searcher: " + e.getMessage());
                throw new IOException(e);
            }
        }

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                String[] parts = value.toString().split("\\|");
                if (parts.length != 5) {
                    continue;
                }
                try {
                    String ip = parts[0];
                    String rawTime = parts[1];
                    String traffic = parts[2];
                    String type = parts[3];
                    String id = parts[4];

                    // 1. IP -> "city(ip)".
                    String city = getCityFromIP(ip);

                    // 2. Normalize 10/Nov/2016:00:01:02 +0800 -> 2016-11-10 00:01:02.
                    Date date = inputFormat.parse(rawTime);
                    String formattedTime = outputFormat.format(date);

                    // 3. Re-derive the day from the parsed timestamp instead of
                    //    trusting the raw input column.
                    String day = dayFormat.format(date);

                    // 4. Strip non-digit noise from traffic (e.g. "54 ").
                    String cleanTraffic = traffic.replaceAll("[^0-9]", "");
                    if (cleanTraffic.isEmpty()) {
                        cleanTraffic = "0";
                    }

                    String output = String.format("%s,%s,%s,%s,%s,%s",
                            city, formattedTime, day, cleanTraffic, type, id);
                    // Null key: TextOutputFormat then writes only the value,
                    // giving a clean CSV line for the Hive external location.
                    context.write(null, new Text(output));
                } catch (Exception e) {
                    System.err.println("Error processing record: " + value.toString());
                    e.printStackTrace();
                }
            }
        }

        /**
         * Resolve an IP to "city(ip)". ip2region returns
         * "country|region|province|city|isp"-style fields; "0" or "-" mark an
         * unknown component, so fall back city -> province -> country -> 未知.
         * Per-record stdout debug logging removed: it ran once per input line
         * and flooded the task logs.
         */
        private String getCityFromIP(String ip) {
            try {
                String region = searcher.search(ip);
                String[] regionParts = region.split("\\|");
                String location = "未知";
                if (regionParts.length >= 3) {
                    String country = regionParts[0];
                    String province = regionParts[1];
                    String city = regionParts[2];
                    if (!"0".equals(city) && !"-".equals(city)) {
                        location = city;
                    } else if (!"0".equals(province) && !"-".equals(province)) {
                        location = province;
                    } else if (!"0".equals(country) && !"-".equals(country)) {
                        location = country;
                    }
                }
                return String.format("%s(%s)", location, ip);
            } catch (Exception e) {
                System.err.println("Failed to search IP " + ip + ": " + e.getMessage());
                return String.format("未知(%s)", ip);
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                    System.err.println("Failed to close searcher: " + e.getMessage());
                }
            }
        }
    }

    /** Job driver: wires the two stages into a single map-reduce job. */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: o.DataCleaner <input> <output>");
            System.exit(1);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Data Cleaner Two Stages");
        job.setJarByClass(DataCleaner.class);
        job.setMapperClass(FirstStageMapper.class);
        job.setReducerClass(SecondStageReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Single reducer so the output is one file Hive can point at directly.
        job.setNumReduceTasks(1);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

第二阶段

Hive SQL 语句。注意:最后一个数据分析的 SQL 是有错误的——它并不能把城市正确区分开(大部分记录仍按完整的"城市(IP)"字符串分组),看了后面的可视化结果就明白了。

-- Set up the analysis database and the landing table over the MR output.
CREATE DATABASE IF NOT EXISTS log_analysis;

USE log_analysis;

DROP TABLE IF EXISTS data;

-- Schema matches the cleaner's CSV output: city(ip),time,day,traffic,type,id.
-- `time` is backtick-quoted because it is a Hive keyword.
CREATE TABLE IF NOT EXISTS data (
    ip_city STRING COMMENT 'IP归属城市',
    `time`  STRING COMMENT '访问时间',
    day     STRING COMMENT '访问天数',
    traffic BIGINT COMMENT '流量消耗',
    type    STRING COMMENT '内容类型',
    id      STRING COMMENT '内容ID'
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/hadoops/output';

-- 1. Top10 most popular content (video/article) by visit count, with each
--    item's share of total visits plus RANK/DENSE_RANK over the same ordering.
CREATE TABLE top10_content_by_visits_detail AS
SELECT
    type AS content_type,
    id AS content_id,
    COUNT(*) AS visit_count,
    ROUND(COUNT(*) * 100.0 / total.total_count, 3) AS percentage_of_total,
    RANK() OVER (ORDER BY COUNT(*) DESC) AS overall_rank,
    DENSE_RANK() OVER (ORDER BY COUNT(*) DESC) AS dense_rank
FROM data
CROSS JOIN (SELECT COUNT(*) AS total_count FROM data) total
GROUP BY type, id, total.total_count
ORDER BY visit_count DESC
LIMIT 10;
-- 3. Top10 courses by total traffic. Reports per-course totals, averages,
--    min/max per visit, share of overall traffic/visits, and traffic ranks.
CREATE TABLE top_courses_by_traffic_detail AS
SELECT
    id AS course_id,
    type AS content_type,
    SUM(traffic) AS total_traffic,
    COUNT(*) AS visit_count,
    ROUND(AVG(traffic), 2) AS avg_traffic_per_visit,
    MIN(traffic) AS min_traffic_per_visit,
    MAX(traffic) AS max_traffic_per_visit,
    ROUND(SUM(traffic) * 100.0 / total.total_traffic, 4) AS traffic_percentage,
    ROUND(COUNT(*) * 100.0 / total.total_visits, 4) AS visit_percentage,
    RANK() OVER (ORDER BY SUM(traffic) DESC) AS traffic_rank,
    DENSE_RANK() OVER (ORDER BY SUM(traffic) DESC) AS dense_traffic_rank
FROM data
CROSS JOIN (
    SELECT
        SUM(traffic) AS total_traffic,
        COUNT(*) AS total_visits
    FROM data
) total
GROUP BY id, type, total.total_traffic, total.total_visits
ORDER BY total_traffic DESC
-- Fixed: was LIMIT 15, but the requirement ("按照流量统计最受欢迎的Top10课程")
-- and this table's name both call for the top 10.
LIMIT 10;
-- 按照地市统计最受欢迎的Top10(出现城市最多的前10个)
-- Top10 cities by visit count.
CREATE TABLE top10_cities_by_visits_detail AS
-- city_extracted: one row per log record with the bare city name.
WITH city_extracted AS (
    SELECT
        -- Fixed: strip the "(ip)" suffix unconditionally. The previous
        -- CASE only stripped it when the name contained '市', so every
        -- other location kept its unique "city(ip)" string and the later
        -- GROUP BY never merged records from the same city. The cleaner
        -- always emits "城市(ip)" (or "未知(ip)"), so a plain
        -- substring_index before '(' is always correct.
        TRIM(SUBSTRING_INDEX(ip_city, '(', 1)) AS city_name,
        type AS content_type,
        id AS content_id,
        traffic,
        1 AS visit_count
    FROM data
    WHERE ip_city IS NOT NULL AND ip_city != ''
),
-- city_aggregated: per-city visit/traffic rollups.
city_aggregated AS (
    SELECT
        city_name,
        COUNT(*) AS total_visits,
        COUNT(DISTINCT content_id) AS unique_content_count,
        SUM(traffic) AS total_traffic,
        ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM data), 4) AS visit_percentage,
        ROUND(AVG(traffic), 2) AS avg_traffic_per_visit,
        MIN(traffic) AS min_traffic,
        MAX(traffic) AS max_traffic
    FROM city_extracted
    GROUP BY city_name
),
-- total_metrics: grand totals used for the traffic share.
total_metrics AS (
    SELECT
        SUM(total_visits) AS grand_total_visits,
        SUM(total_traffic) AS grand_total_traffic
    FROM city_aggregated
)
SELECT
    ca.city_name,
    ca.total_visits,
    ca.unique_content_count,
    ca.total_traffic,
    ca.avg_traffic_per_visit,
    ca.min_traffic,
    ca.max_traffic,
    ca.visit_percentage,
    ROUND(ca.total_traffic * 100.0 / tm.grand_total_traffic, 4) AS traffic_percentage,
    RANK() OVER (ORDER BY ca.total_visits DESC) AS city_rank,
    DENSE_RANK() OVER (ORDER BY ca.total_visits DESC) AS city_dense_rank,
    PERCENT_RANK() OVER (ORDER BY ca.total_visits DESC) AS city_percent_rank,
    CUME_DIST() OVER (ORDER BY ca.total_visits DESC) AS city_cume_dist
FROM city_aggregated ca
CROSS JOIN total_metrics tm
ORDER BY ca.total_visits DESC
LIMIT 10;

可视化比较简单,采用springboot+vue+mysql实现

image

 

需要专业的网站建设服务?

联系我们获取免费的网站建设咨询和方案报价,让我们帮助您实现业务目标

立即咨询