
Web Crawler Learning Notes -- GSMarena Phone Specifications

Goals:

  1. Fetch every phone specification page on GSMarena and save each page to a local file
  2. Parse the locally saved specification pages and export the spec data to Excel

Code used along the way:

First, fetch the pages and save them locally.

Start page: https://www.gsmarena.com/makers.php3
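
A minimal sketch of this first step (the header set and output file name here are illustrative; the full script follows below): request the start page, pull the brand links out with XPath, and write the fetched page to disk as raw bytes.

# minimal sketch: fetch the brand index and list the brand page links
import requests
from lxml import html

resp = requests.get('https://www.gsmarena.com/makers.php3',
                    headers={'User-Agent': 'Mozilla/5.0'})  # a plain UA; the full script sends more headers
tree = html.fromstring(resp.content)
hrefs = tree.xpath('//tr//a/@href')  # the same XPath the full script uses for brand links
print(len(hrefs), hrefs[:3])

with open('makers.php3.html', 'wb') as f:  # illustrative file name; the full script saves every model page this way
    f.write(resp.content)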

# -*- coding: utf-8 -*-
# 2022.02.11
# Lyu Yang
import os
import requests
from lxml import html
import time
import re


'''
If the site drops the connection remotely, restart automatically up to 3 times.
After that, restart manually instead of retrying further, to avoid getting banned.
'''

headers = {
    'Host': 'm.gsmarena.com',  # edit to match the target site
    # 'Host': '161.139.76.xxx',  # edit to match the target site
    'Accept-Language': 'en-US,en;q=0.9,ko;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.43',
}

cookie = {}  # this site also works without cookies
with open('./cookie.txt', 'r') as f:
    raw_cookies = f.read()
    # print(raw_cookies)
    for line in raw_cookies.split(';'):
        key, value = line.split("=", 1)
        cookie[key] = value  # a bit of formatting to load the cookies into a dict


proxies = {
    # 'http': 'xxx.xxx.xxx.xxx:8080'
}

def save(text, filename='temp', path='download'):  # save a string or bytes to a file
    fpath = os.path.join(path, filename)
    if not os.path.exists(path):
        os.mkdir(path)
    if isinstance(text, str):  # encode str so it can be written in binary mode
        text = text.encode('utf-8')
    with open(fpath, 'wb') as f:
        print('output:', fpath)
        f.write(text)

def requrlstotree(url):  # request a url and return the lxml tree for XPath parsing
    resp = requests.get(url, headers=headers)
    if not resp.status_code == requests.codes.ok:
        print('something wrong')
        print(url)
        return -1
    time.sleep(5)  # fetch finished, pause for a moment
    page = resp.content
    tree = html.fromstring(page)
    return tree

def requrlstocont(url):  # request a url and return the raw content
    resp = requests.get(url, headers=headers)  # a newer urllib3 raised a hostname error; pinning urllib3 to 1.25.11 solved it
    print(resp.status_code)
    # print(resp.content)
    if not resp.status_code == requests.codes.ok:
        print('something wrong')
        print(url)
        return -1
    time.sleep(5)  # fetch finished, pause for a moment
    page = resp.content
    return page

def savelog(text):  # append to the error log, recording how many retries happened
    with open('./log.txt', 'a') as f:
        f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ': ' + text + '\n')
    print('log added')

def crawlurls():
    # collect the url of every brand page
    url = 'https://www.gsmarena.com/makers.php3'  # All mobile phone brands
    tree = requrlstotree(url)  # parse the page listing every phone brand
    if tree == -1:
        return -1
    brands = tree.xpath('//tr//a/text()')  # text inside the <a> tags
    print('total brands = ' + str(len(brands)))
    urls = tree.xpath('//tr//a/@href')
    print('total urls = ' + str(len(urls)))
    brandsurls = {}
    for i in range(len(brands)):  # map brand -> url; the brand name is not used later, kept here as practice
        urlsfull = 'https://' + headers['Host'] + '/' + urls[i]  # build the full url; adjust for other sites, headers['Host'] is not essential here
        brandsurls[brands[i]] = urlsfull
        print(brands[i] + ' ' + urlsfull)

    # collect the url of every model page for each brand
    models = []
    for url in brandsurls.values():  # iterate over every value in the dict
        tree = requrlstotree(url)  # parse the brand's listing page
        if tree == -1:
            return -1
        modelsPages = tree.xpath('//div[@class="nav-pages"]//a/@href')  # listing pages from page 2 onward
        modelsPages = ['https://' + headers['Host'] + '/' + i for i in modelsPages]
        modelsPages.insert(0, url)  # prepend the first listing page
        print(modelsPages)
        for url in modelsPages:  # extract the url of every phone on each listing page
            tree = requrlstotree(url)
            if tree == -1:
                return -1
            urls = tree.xpath('//div[@class="makers"]//li/a/@href')
            urls = ['https://' + headers['Host'] + '/' + i for i in urls]
            models.extend(urls)
            print(models)
    save('\n'.join(models), 'all_models_urls.txt', './')  # save all model urls

def crawlcontents():
    global count  # retry counter shared with __main__
    # check whether the download is complete, i.e. whether ./pages/ already holds every file;
    # drop already-downloaded files via a set, for restarting after an error mid-download
    print('filter exists files...')
    with open('./all_models_urls.txt', 'r') as f:  # read the saved url list
        modelsurlslist = f.readlines()
    modelslistall = [i.split('/')[-1] for i in modelsurlslist]  # keep only the trailing file name + extension
    modelslistall = [i.replace('\n', '') for i in modelslistall]  # readlines() keeps the trailing \n, strip it
    print('modelslist ' + str(len(modelslistall)))
    modelslistexists = os.listdir('./pages/')  # files already in the pages folder
    print('modelslistexists ' + str(len(modelslistexists)))
    modelslistdiff = list(set(modelslistall).difference(set(modelslistexists)))  # set difference; slightly off because special characters are stripped from saved file names
    modelslistdiff.sort()  # the set difference shuffles the order (presumably due to hashing), so sort again
    print('modelslistdiff ' + str(len(modelslistdiff)))
    print('special: ' + str(len(modelslistall) - len(modelslistexists) - len(modelslistdiff)))

    for modelsfilename in modelslistdiff:
        # skip names with special characters that were already saved under a sanitized file name
        if re.sub(r'[\/:*?"<>|]', '_', modelsfilename) in modelslistexists:
            print('special char: ' + modelsfilename)
        # download the remaining files
        else:
            url = 'https://' + headers['Host'] + '/' + modelsfilename
            # url = 'https://m.gsmarena.com/vivo_y20s_[g]-ampp-10847.php'  # not chasing this bug, let it go...
            print(url)
            page = requrlstocont(url)
            if page == -1:  # e.g. the site refused the request, exit the program directly
                return -1
            filename = re.sub(r'[\/:*?"<>|]', '_', modelsfilename)  # strip special characters from the file name
            save(page, filename, './pages/')  # save every page to a file

        count = 0
        # break  # for debugging
    return 0  # return 0 when everything is done

if __name__ == '__main__':
    # flag = crawlurls()
    # flag = crawlcontents()  # for debugging
    flag = 1
    count = 0  # used as a global variable inside crawlcontents()
    while flag == 1:  # unless the site is blocking us, retry up to 3 times after an error
        try:
            flag = crawlcontents()
        except Exception:
            count += 1
            print('try again')
            savelog('try again')
            time.sleep(60)  # pause 60 seconds, then reconnect
            if count > 2:
                print('tried 3 times')
                savelog('tried 3 times')
                break
        else:
            if flag == -1:
                savelog('something wrong')
                print('something wrong')
                break
            else:
                savelog('all done')
                print('all done')
                break
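
For reference, the hand-rolled retry loop above (a while loop plus a global retry counter) could also be pushed down to the transport layer; this is only a sketch of an alternative, not what the script uses, and it assumes retrying on 429/5xx responses is acceptable for this site.

# alternative sketch (not used above): let requests/urllib3 handle retries
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=3,  # the same "3 attempts" budget as the manual loop
                backoff_factor=5,  # exponential back-off between attempts
                status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))
# session.get(url, headers=headers) then retries transparently before raising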

Then, parse the saved pages with XPath and output the formatted spec information.
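
The parsing relies on GSMarena marking each spec value with a data-spec attribute, so one XPath pattern covers every field. A minimal sketch on a single saved page (the file name is illustrative; the full script below walks the whole ./pages/ folder):

# minimal sketch: pull a few data-spec fields out of one saved page
from lxml import html

with open('./pages/some_model.php', 'r', encoding='utf-8', errors='ignore') as f:  # illustrative file name
    tree = html.fromstring(f.read())

for name in ('modelname', 'chipset', 'year'):
    value = ''.join(tree.xpath('//*[@data-spec="' + name + '"]//text()'))
    print(name, '=', value)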

# -*- coding: utf-8 -*-
# 2022.02.12
import os
from lxml import html
import pandas as pd


os.system('cls')  # clear the console (Windows)
specnamelist = [  # the values come from inspecting the page markup (data-spec attributes)
    'modelname',
    'nettech',
    'year',
    'status',
    'dimensions',
    'weight',
    'displaytype',
    'chipset',
    'cam1modules',
    'cam1features',
    'cam1video',
    'cam2modules',
    'cam2video',
    'sensors'
]

def getspec():
    modelslist = os.listdir('./pages/')  # list the saved page files
    datalist = []
    modelslistlen = len(modelslist)
    count = 0
    print('filter the data...')
    for fname in modelslist:  # iterate over the file list
        templist = []
        filepath = './pages/' + fname  # build the file path
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:  # open() defaults to GBK/cp936 here; errors='ignore' works around decode errors, side effects unknown
            # print(f)
            htmls = f.read()
        tree = html.fromstring(htmls)
        for name in specnamelist:
            templist.append(''.join(tree.xpath('//*[@data-spec="' + name + '"]//text()')))  # ''.join() turns the list into a string before appending to templist
        datalist.append(templist)
        count += 1
        if count % max(1, modelslistlen // 40) == 0:  # progress roughly every 2.5%; max(1, ...) avoids dividing by zero on small folders
            print(str(int(count / modelslistlen * 100)) + '%', end='\r')
        # print(templist)  # printing slows things down
    df = pd.DataFrame(datalist, columns=specnamelist)  # save to excel with pandas
    df.to_excel('./GSMarena.xlsx', 'GSMarena', index=False)
    print('100%')
    print('Saved')

if __name__ == '__main__':
    getspec()

    with open('./all_models_urls.txt', 'r') as f:  # quick check: how many urls were collected in step 1
        modelslist = f.readlines()
    print(len(modelslist))
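
A quick way to sanity-check the output (illustrative, assuming the GSMarena.xlsx written above):

# illustrative check of the generated spreadsheet
import pandas as pd

df = pd.read_excel('./GSMarena.xlsx', sheet_name='GSMarena')
print(df.shape)  # row count should roughly match the number of files in ./pages/
print(df['modelname'].head())  # first few parsed model names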
