介绍

这次内容太多了，各个环节关联性都比较强。如果将各个部分放在一起讲解，会造成混乱。因此把【python与GIS数据处理3】分成6个部分。中间迭代了几个版本，才保证各个环节的连贯性。

具体内容如下：

安装包部分和数据部分；
nc切片保存为tiff文件；
高纬度矩阵的求和和求均值操作；
坐标系的转换；
基于时间范围提取数据；
对特定区域进行裁切；
基于1～6的整体应用。

代码

代码我已经放在Github上面了，免费分享使用，https://github.com/yuanzhoulvpi2017/tiny_python/tree/main/python_GIS。

效果：

中国平均气温和世界平均气温的对比

特定时间范围内的可视化

批量导出tiff文件

具体的tiff打开是这样的：

并且也裁切了单独的中国地图版本：

获得数据可以直接在公众号后台回复：气象数据

导入包

# 基础的数据处理工具
import numpy as np
import pandas as pd

# 可视化
import matplotlib.pyplot as plt

# 处理python时间函数
import datetime

# 处理nc数据
import netCDF4 as nc
from netCDF4 import num2date

# 处理网格数据，shp之类的
import geopandas as gpd

# 处理tiff文件
import rasterio

# gis的一些逻辑判断
from shapely.geometry import Point

# 设置投影坐标系等
from cartopy import crs as ccrs

# 打印进度条
from tqdm import tqdm

tqdm.pandas()

# 检测系统

import platform

# matplotlib 显示中文的问题
if platform.system() == 'Darwin':



    
    plt.rcParams["font.family"] = 'Arial Unicode MS'
elif platform.system() == 'Windows':
    plt.rcParams["font.family"] = 'SimHei'
else:
    pass

加载并且处理数据

加载nc文件和读取数据

nc_data = nc.Dataset("cru_ts4.05.1901.2020.tmp.dat.nc")

提取变量

raw_lat_data = np.array(nc_data.variables['lat'])
raw_lon_data = np.array(nc_data.variables['lon'])
raw_time_data = np.array(nc_data.variables['time'])
raw_tmp_data = np.array(nc_data.variables['tmp'])

# 变量的缺失值
tmp_missing_value = nc_data.variables['tmp'].missing_value

# 对原始的缺失值做替换，替换成numpy可以识别的缺失值

raw_tmp_data[raw_tmp_data == tmp_missing_value] = np.nan

对时间处理

def cftime2datetime(cftime, units, format='%Y-%m-%d %H:%M:%S'):
    """
    将nc文件里面的时间格式 从cftime 转换到 datetime格式
    :param cftime:
    :param units:
    :param format:
    :return:
    """
    return datetime.datetime.strptime(num2date(times=cftime, units=units).strftime(format), format)


clean_time_data = pd.Series([cftime2datetime(i, units='days since 1900-1-1') for i in raw_time_data])
clean_time_data[:4]

处理边界数据

导入中国边界数据

china_boundary = gpd.read_file(filename="数据集/中国地图边界202111版.json")

china_boundary_valid = china_boundary.copy()
china_boundary_valid['geometry'] = china_boundary.buffer(0)

计算相关数据


# 计算中心点
center_china_point = china_boundary_valid.centroid[0]
center_china_point.x, center_china_point.y

## 设置投影


# 原来的投影系统
raw_crs = china_boundary_valid.crs

# 新的兰伯特投影
new_crs = ccrs.LambertConformal(central_longitude=center_china_point.x,
                                central_latitude=center_china_point.y)

基础计算部分

计算中国地图点的mask

def pic(lon, lat) -> bool:
    """
    检测一个点是否在中国边界线内
    lon:东经
    lat:北纬
    :param lon:
    :param lat:
    :return:
    """
    


    
return china_boundary_valid.contains(Point(lon, lat))[0]


mask_for_china = np.full(shape=raw_tmp_data.shape[1:], fill_value=False)

for index_lat in tqdm(range(raw_lat_data.shape[0])):
    for index_lon in range(raw_lon_data.shape[0]):
        point = (raw_lon_data[index_lon], raw_lat_data[index_lat])
        mask_for_china[index_lat, index_lon] = pic(point[0], point[1])

基础函数部分

将矩阵保存到tiff文件中

def array2gtiff(array, filename):
    """
    将一个矩阵保存为tiff文件,
    这里还可以设置tiff的crs和transofrm。更多，可以查看rasterio的官网或者下面的这个链接
    https://gis.stackexchange.com/questions/279953/numpy-array-to-gtiff-using-rasterio-without-source-raster
    :param array: shape:(row, col)
    :param filename:
    :return:
    """
    with rasterio.open(filename, 'w', driver='GTiff',
                       height=array.shape[0], width=array.shape[1],
                       count=1, dtype=str(array.dtype)) as f:
        f.write(array, 1)


# 保存mask，测试我们的函数都对不对
array2gtiff(array=mask_for_china.astype('float64')[::-1, :],
            filename="结果/test_mask.tiff")

需求部分

全世界的年月变化折线图；
中国的年月变化折线图；
全世界年变化折线图；
中国年变化折线图；
全世界每一年的平均数保存为tiff文件；
中国每一年的平均数保存为tiff文件；
特定时间范围内的数据保存为tiff文件（包括中国和全世界两种）
特定时间范围内的且中国范围内的数据求平均数可视化（使用兰伯特投影）

需求1和需求2

全世界的年月变化折线图；
中国的年月变化折线图；

need_1 = pd.DataFrame()
need_1['date'] = clean_time_data

need_1['world_value'] = [np.nanmean(raw_tmp_data[i, :, :]) for i in tqdm(range(raw_tmp_data.shape[0]))]

need_1['china_value'] = [np.nanmean(raw_tmp_data[i, :, :][mask_for_china]) for i in tqdm(range(raw_tmp_data.shape[0]))]
need_1.head()

fig, ax = plt.subplots(figsize=(10, 4), dpi=150)
ax.plot(need_1['date'], need_1['world_value'], label='世界平均温度')
ax.plot(need_1['date'], need_1['china_value'], label='中国平均温度')
ax.legend()

需求3和4

全世界年变化折线图；
中国年变化折线图；

def cal_need34(year, type):
    out = np.nanmean(raw_tmp_data[clean_time_data.dt.year == year, :, :], axis=0)
    if type == 'world':
        value = np.nanmean(out)
    elif type == 'china':
        value = np.nanmean(out[mask_for_china])
    else:
        value = None



    
    return value


# test function

cal_need34(year=2001, type='china'), cal_need34(year=2001, type='world')

#%%

year = np.arange(start=np.min(clean_time_data.dt.year),
                 stop=np.max(clean_time_data.dt.year) + 1)

need_34 = pd.DataFrame({'year': year})
need_34['china_value'] = need_34['year'].apply(lambda x: cal_need34(year=x, type='china'))
need_34['world_value'] = need_34['year'].apply(lambda x: cal_need34(year=x, type='world'))
need_34

def smooth_data(y_value, deg=4):
    x_new = np.arange(y_value.shape[0])
    new_param = np.polyfit(x_new, y_value, deg=deg)
    value = np.zeros_like(x_new)
    for index, param in enumerate(new_param[::-1]):
        value = value + param * x_new ** index
    return value


need_34['smooth_china_value'] = smooth_data(y_value=need_34['china_value'])
need_34['smooth_world_value'] = smooth_data(y_value=need_34['world_value'])

fig, ax = plt.subplots(figsize=(10, 4), dpi=150)
ax.plot(need_34['year'], need_34['world_value'], label='世界平均温度', linestyle='-', marker='o')
ax.plot(need_34['year'], need_34['china_value'], label='中国平均温度', linestyle='-', marker='o')

# 增加拟合曲线
ax.plot(need_34['year'], need_34['smooth_china_value'], linestyle='--', color='gray')
ax.plot(need_34['year'], need_34['smooth_world_value'], linestyle='--', color='gray')

ax.set_title("每年平均气温")
ax.legend()
ax.set_xlabel("年份")
ax.set_ylabel("温度平均数$ ^\circ C $")
plt.tight_layout()
fig.savefig("结果/每年平均气温.png")

需求5和6

全世界每一年的平均数保存为tiff文件；
中国每一年的平均数保存为tiff文件；

def cal_need56(year, type):
    out = np.nanmean(raw_tmp_data[clean_time_data.dt.year == year, :, :], axis=0)
    if type == 'world':
        value = out
    elif type == 'china':
        out[~mask_for_china] = np.nan
        value = out
    else:
        value = None
    return value


out = cal_need56(year=2000, type='china')
array2gtiff(array=out[::-1, :], filename="结果/year2000.tiff")

out = cal_need56(year=2000, type='world')
array2gtiff(array=out[::-1, :], filename="结果/year2000_world.tiff")

#%%

for temp_year in tqdm(range(np.min(clean_time_data.dt.year), np.max(clean_time_data.dt.year) + 1)):
    out = cal_need56(year=temp_year, type='china')
    array2gtiff(array=out[::-1, :], filename=f"结果/year_{temp_year}_china.tiff")

    out = cal_need56(year=temp_year, type='world')
    array2gtiff(array=out[::-1, :], filename=f"结果/year_{temp_year}_world.tiff")

需求7

特定时间范围内的数据保存为tiff文件（包括中国和全世界两种）




    
def cal_need7(start_date, end_date, type):
    out = np.nanmean(raw_tmp_data[(start_date <= clean_time_data) & (clean_time_data <= end_date), :, :], axis=0)
    if type == 'world':
        value = out
    elif type == 'china':
        out[~mask_for_china] = np.nan
        value = out
    else:
        value = None
    return value

out = cal_need7(start_date='1902-01', end_date='2020-01', type='world')
array2gtiff(array=out[::-1, :], filename="结果/data190201_202001_world.tiff")

out = cal_need7(start_date='1902-01', end_date='2020-01', type='china')
array2gtiff(array=out[::-1, :], filename="结果/data190201_202001_china.tiff")

需求8

特定时间范围内的且中国范围内的数据求平均数可视化（使用兰伯特投影）

#%%

out_8 = cal_need7(start_date='1902-01', end_date='2020-01', type='china')


#%%

Lon_data, Lat_data = np.meshgrid(raw_lon_data, raw_lat_data)

need_8_df = pd.DataFrame({'value':out_8.flatten(),
                          'mask_china':mask_for_china.flatten(),
                          'lon':Lon_data.flatten(),
                          'lat':Lat_data.flatten()})
need_8_df = need_8_df.loc[need_8_df['mask_china']]
need_8_df

#%%

need_8_df_gd = gpd.GeoDataFrame(
    need_8_df,
    geometry=gpd.points_from_xy(x=need_8_df['lon'], y=need_8_df['lat']),
    crs= raw_crs#china_boundary.crs
)
need_8_df_gd = need_8_df_gd.to_crs(new_crs.proj4_init)
need_8_df_gd

#%%

china_boundary_valid_new_crs = china_boundary_valid.to_crs(new_crs.proj4_init)
china_boundary_valid_new_crs

#%%

fig, ax = plt.subplots(figsize=(8, 7), dpi=150, subplot_kw={'projection': new_crs})
ax.gridlines(draw_labels=True,
             linewidth=2, color='gray', alpha=0.5, linestyle='--')
china_boundary_valid_new_crs.boundary.plot(ax=ax, color='black', marker='s')
need_8_df_gd.plot(ax=ax, column='value', cmap=plt.cm.get_cmap('RdYlBu'), legend=True)
ax.set_xlabel("longitude")
ax.set_ylabel("latitude")
ax.set_title(f"时间范围为: 1902-01 ~ 202-01")
plt.tight_layout()

fig.savefig("结果/中国可视化.png")

#%%

7600字详细介绍python处理nc文件的6种处理（阅读需要20分钟）

介绍