## 问题1: 现在有一张 country_populartion 表,包含:id,country,city,popularation 字段,请使用 sql 和 Python 统计每个城市的人口占比,统计指定城市的人口占比?
sql
"SELECT
country,
city,
population,
ROUND(
(population * 100.0) / SUM(population) OVER(), 2
) AS percentage
FROM country_population
ORDER BY population DESC;
"
python
"import pandas as pd
# 假设 df 是从数据库读取的 DataFrame
# df = pd.read_sql("SELECT * FROM country_population", connection)
# 计算总人口
total_pop = df['population'].sum()
# 计算占比
df['percentage'] = (df['population'] / total_pop) * 100
df['percentage'] = df['percentage'].round(2) # 保留两位小数
print(df[['country', 'city', 'population', 'percentage']])
"
## 问题2.现在有一张 country_populartion 表,包含:id,country,city,popularation 字段,统计指定城市的人口占比?
sql
"-- 方案 A:将指定城市合并为一行计算总占比
SELECT
'Top_Cities' AS group_name,
SUM(population) AS total_pop,
ROUND(
(SUM(population) * 100.0) / (SELECT SUM(population) FROM country_population), 2
) AS percentage
FROM country_population
WHERE city IN ('北京', '上海', '深圳');
-- 方案 B:列出指定城市,并计算它们各自及合计的占比
SELECT
city,
population,
ROUND(
(population * 100.0) / (SELECT SUM(population) FROM country_population), 2
) AS percentage
FROM country_population
WHERE city IN ('北京', '上海', '深圳');
"
python
"
import pandas as pd
# 假设 df 是原始数据
# df = pd.read_sql("SELECT * FROM country_population", connection)
# 定义你感兴趣的指定城市列表
target_cities = ['北京', '上海', '深圳']
# 筛选出指定城市的数据
df_target = df[df['city'].isin(target_cities)].copy()
# 计算总人口(用于做分母)
total_population = df['population'].sum()
# 计算每个指定城市的占比
df_target['percentage'] = (df_target['population'] / total_population) * 100
df_target['percentage'] = df_target['percentage'].round(2)
# 如果你想计算这“几个城市”加起来的总占比:
combined_percentage = df_target['population'].sum() / total_population * 100
print(f"指定城市({target_cities})总人口占比: {combined_percentage:.2f}%")
# 显示各个城市的占比详情
print(df_target[['city', 'population', 'percentage']])
"
3.统计每个国家人口总数排名前二的城市
sql
"SELECT
country,
city,
population
FROM (
SELECT
country,
city,
population,
ROW_NUMBER() OVER (PARTITION BY country ORDER BY population DESC) AS rn
FROM country_population
WHERE population IS NOT NULL -- 排除空值
) ranked
WHERE rn <= 2
ORDER BY country, rn;
"
python
"import pandas as pd
# 假设 df 是从数据库读取的数据
# df = pd.read_sql("SELECT * FROM country_population", connection)
# 1. 数据清洗:去除人口为空的行
df = df.dropna(subset=['population'])
# 2. 按国家分组,组内按人口降序排序,并生成排名
df['rank'] = df.groupby('country')['population'].rank(method='first', ascending=False)
# 3. 筛选排名前2的城市
result = df[df['rank'] <= 2]
# 4. 选择需要展示的列并排序
result = result[['country', 'city', 'population', 'rank']].sort_values(['country', 'rank'])
print(result)
"