但愿历尽千帆,终能得偿所愿。
一、数据获取
土地市场数据一般会公示在当地的公共资源交易中心,但经常会出现只公示当周或当月数据的情况,因此,我们得去找专业的土地网站获取交易数据。比如土流网:
https://www.tudinet.com/market-0-0-0-0/
网站结构简单,简单的url翻页构造,然后用xpath解析提取数据即可。
爬虫代码如下:
import requests
from lxml import etree
import random
import time
import logging
import openpyxl
from datetime import datetime
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['土地位置', '出让形式', '推出时间', '土地面积', '规划建筑面积', '土地地址', '成交状态', '土地代号', '规划用途'])
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
user_agent = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
start = datetime.now()
def get_info(page):
headers = {
"User-Agent": random.choice(user_agent)
}
url = f'https://www.tudinet.com/market-254-0-0-0/list-pg{page}.html'
resp = requests.get(url, headers=headers).text
time.sleep(1)
html = etree.HTML(resp)
lis = html.xpath('//div[@class="land-l-cont"]/dl')
for li in lis:
try:
location = li.xpath('.//dd/p[7]/text()')[0]
transfer_form = li.
xpath('.//dt/i/text()')[0]
launch_time = li.xpath('.//dd/p[1]/text()')[0]
land_area = li.xpath('.//dd/p[3]/text()')[0]
planning_area = li.xpath('.//dd/p[5]/text()')[0]
address = li.xpath('.//dd/p[4]/text()')[0]
state = li.xpath('.//dd/p[2]/text()')[0]
area_code = li.xpath('.//dt/span/text()')[0]
planned_use = li.xpath('.//dd/p[6]/text()')[0]
data = [location, transfer_form, launch_time, land_area, planning_area, address, state, area_code, planned_use]
sheet.append(data)
logging.info(data)
except Exception as e:
logging.info(e.args[0])
continue
def main():
for i in range(1, 101):
get_info(i)
logging.info(f'抓取第{i}页数据完毕')
time.sleep(random.uniform(1, 2))
wb.save(filename="real_estate_info.xlsx")
if __name__ == '__main__':
main()
delta = (datetime.now() - start).total_seconds()
print(f'数据抓取完毕,用时:{delta}')
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
运行爬虫代码,提取到成都地区3158块土地数据,结果如下:
三、数据查看
数据比较干净和完整,可以直接用于数据分析。
三、分析土地交易数据
1. 土地出让形式&土地成交状态
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.globals import CurrentConfig, ThemeType
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'
df = pd.read_excel('real_estate_info.xlsx').loc[:, ['出让形式', '成交状态']]
df1 = df['出让形式'].value_counts()
df2 = df['成交状态'].value_counts()
data_pair_1 = [(i, int(j)) for i, j in zip(df1.index, df1.values)]
data_pair_2 = [(i, int(j)) for i, j in zip(df2.index, df2.values)]
c = (
Pie(init_opts=opts.InitOpts(theme=ThemeType.DARK, width="1100px", height="500px"))
.add(
"土地出让形式",
data_pair_1,
center=["25%",
"50%"],
label_opts=opts.LabelOpts(is_show=True),
)
.set_colors(['red', 'blue', 'purple'])
.add(
"土地成交状态",
data_pair_2,
center=["70%", "50%"],
label_opts=opts.LabelOpts(is_show=True),
)
.set_global_opts(title_opts=opts.TitleOpts(title="土地出让形式&土地成交状态占比"),
legend_opts=opts.LegendOpts(is_show=False)
)
.set_series_opts(
tooltip_opts=opts.TooltipOpts(
trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
)
)
.render("pie_.html")
)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
统计分析,并用pyecharts饼图可视化。已有的数据中,从 2015年9月 到 2020年2月,成都土地出让形式:挂牌出让占比67.73%、拍卖出让占比31.45%,只有很少一部分是招标出让,仅占比0.82%,成都土地招拍挂未成交和流拍土地占比不到一半,而已成交土地占比高达65.77%,整体成交率较高,原因可能为有意向竞拍人数量多、出价比较合适。
2. 土地交易面积
import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import CurrentConfig, ThemeType
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'
df = pd.read_excel('real_estate_info.xlsx').loc[:, ['推出时间', '土地面积', '规划建筑面积']]
date = df['推出时间'].str.split('年', expand=
True)[0]
df['年份'] = date
df['土地面积'] = df['土地面积'].str[:-1].map(float)
df['规划建筑面积'] = df['规划建筑面积'].str[:-1].map(float)
land_area = df.groupby('年份').agg({'土地面积': 'sum'}) / 10000
planned_area = df.groupby('年份').agg({'规划建筑面积': 'sum'}) / 10000
print(land_area, type(land_area))
print(planned_area, type(planned_area))
years = [int(y) for y in land_area.index[1:-1]]
ydata_1 = [float('{:.2f}'.format(i)) for i in land_area['土地面积'][1:-1]]
ydata_2 = [float('{:.2f}'.format(j)) for j in planned_area['规划建筑面积'][1:-1]]
bar = (
Bar(init_opts=opts.InitOpts(theme=ThemeType.DARK))
.add_xaxis(xaxis_data=years)
.add_yaxis(
series_name='土地面积(万m²)',
yaxis_data=ydata_1,
label_opts=opts.LabelOpts(is_show=False)
)
.add_yaxis(
series_name='规划建筑面积(万m²)',
yaxis_data=ydata_2,
label_opts=opts.LabelOpts(is_show=False)
)
.set_global_opts(
xaxis_opts=opts.AxisOpts(name='年份'),
yaxis_opts=opts.AxisOpts(name='万m²')
)
.set_series_opts(markpoint_opts=opts.MarkPointOpts(
data=[
opts.MarkPointItem(type_="max", name="最大值"),
opts.
MarkPointItem(type_="min", name="最小值"),
]),
)
.render('bar_.html')
)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
从2016年到2019年,土地交易面积逐年增加,2018土地交易面积开始达到高潮,该年总的规划建筑面积为4156.15万m²,之后2019年土地交易面积较2018年有所下降。
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import CurrentConfig, ThemeType
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'
df = pd.read_excel('real_estate_info.xlsx').loc[:, ['推出时间', '土地面积', '规划建筑面积']]
df['土地面积'] = df['土地面积'].str[:-1].map(float)
df['规划建筑面积'] = df['规划建筑面积'].str[:-1].map(float)
date = df['推出时间'].str.split('月', expand=True)[0]
date = date.apply
(lambda x: x + '月')
df['月份'] = date
df1 = df[(df['推出时间'].str[:4] == '2020') | (df['推出时间'].str[:4] == '2019')]
df2 = df1.groupby('月份').agg({'土地面积': 'sum'}) / 10000
df3 = df1.groupby('月份').agg({'规划建筑面积': 'sum'}) / 10000
month = df2.index.tolist()
ydata_1 = [float('{:.2f}'.format(i)) for i in df2['土地面积']]
ydata_2 = [float('{:.2f}'.format(j)) for j in df3['规划建筑面积']]
bar = (
Bar(init_opts=opts.InitOpts(theme=ThemeType.DARK))
.add_xaxis(xaxis_data=month)
.add_yaxis(
series_name='土地面积(万m²)',
yaxis_data=ydata_1,
stack='stack1',
label_opts=opts.LabelOpts(is_show=False)
)
.add_yaxis(
series_name='规划建筑面积(万m²)',
yaxis_data=ydata_2,
stack='stack1',
label_opts=opts.LabelOpts(is_show=False)
)
.reversal_axis()
.set_global_opts(
xaxis_opts=opts.AxisOpts(name='万m²'),
yaxis_opts=opts.AxisOpts(name='月份')
)
.render('reverse_bar.html')
)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
从2019年1月到2020年2月各月份上土地交易面积来看,2019年成都土地交易市场比较活跃,土地交易面积起伏较大,2019年12月规划建筑面积为817.47万m²,达到峰值,之后2020年开始,1、2月土地交易面积下降较多,部分原因可能是受年初国内新冠疫情爆发的影响。
3. 交易土地的规划用途
import pandas as pd
from pyecharts.charts import Radar
from pyecharts import options as opts
from pyecharts.globals import CurrentConfig, ThemeType
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'
df = pd.read_excel('real_estate_info.xlsx')['规划用途']
datas = df.value_counts()
items = datas.index.tolist()
colors = ['#FF0000', '#FF4500', '#00FA9A', '#FFFFF0', '#FFD700']
labels = [opts.RadarIndicatorItem(name=items[i], max_=50, color=colors[i]) for i in range(len(items))]
value = [int(j) for j in datas.values]
radar = (
Radar(init_opts=opts.InitOpts(theme=ThemeType.DARK))
.add_schema(
schema=labels
)
.add(
series_name='土地规划用途占比(%)',
data = [[round((x / sum(value)) * 100, 3) for x in value]],
areastyle_opts=opts.AreaStyleOpts(opacity=0.5, color='blue')
)
.set_global_opts(
)
.render('radar.html')
)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
-
15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
交易土地的用途主要以工业用地为主,工业用地占比高达43.667%,还有相当一部分比例用作商业/办公用地、综合用地、其他用地,住宅用地仅占比5.098%。也从侧面反应出成都注重工业的发展,搜索一些资料了解到,“十二五”期间,成都工业年均增速约14.4%,在15个副省级城市中排名首位,有力支撑了成都地区生产总值迈上“万亿”台阶。
4. 土地成交区域
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib as mpl
df = pd.read_excel('real_estate_info.xlsx')
area = df['土地位置']
with open('test.txt', encoding='utf-8') as f:
areas = f.read().split('、')
for item in areas:
df[item] = [eval(df.loc[x, '规划建筑面积'][:-1]) if item in df.loc[x, '土地位置'] else 0 for x in range(len(df['土地位置']))]
date = df['推出时间'].str.split('年', expand=True)[0]
df['年份'] = date
df1 = df[areas]
df1.index = df['年份']
df2 = df1.groupby('年份').sum()
datas = np.array(df2.iloc[:5, ::].T)
print(datas, type(datas))
x_label = [year for year in range(2015, 2020)]
y_label = areas
mpl.rcParams['font.family'] = 'Kaiti'
fig, ax = plt.subplots(figsize=(
15, 9))
sns.heatmap(data=df2.iloc[:5, ::].T, linewidths=0.25,
linecolor='black', ax=ax, annot=True,
fmt='.1f', cmap='OrRd', robust=True,
)
ax.set_xlabel('年份', fontdict={'size': 18, 'weight': 'bold'})
ax.set_ylabel('行政区', fontdict={'size': 18, 'weight': 'bold'})
ax.set_title(r'各行政区2015-2019年的总规划建筑面积(平方米)', fontsize=25, x=0.5, y=1.02)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
plt.savefig('heat_map.png')
plt.show()
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
从交易区域来看,除双流县和郫县,各行政区每年都有一定土地成交,龙泉驿区和青白江区2018年到2019年交易土地面积最大,土地交易市场火热。
四、其他说明
-
本文数据分析只做学习研究之用,提供的结论仅供参考
-
不足之处,请多多指正