MouRyou's Space Station

Querying 全国大学生创业服务网 (National College Student Entrepreneurship Service Network) Projects by University

2019/02/10

Preface

This is the first post on this blog. I originally meant to write something about environment setup, but I have already set up almost everything I need, so I will cover that later if it ever comes up.
Bored at home over the Chinese New Year, I browsed the 全国大学生创业服务网 (National College Student Entrepreneurship Service Network) for inspiration and wanted to see which projects my own university had. It turned out the site's navigation offers no filter by university, so I wrote a small crawler to grab the data grouped by university.

Main Content

Filtering the project list by category or clicking to the next page leaves the site's URL unchanged, so my first thought was: here we go again, scraping Ajax data 😭.
A quick look, though, showed that this site's Ajax responses can apparently be fetched directly 😂, so there is no real difficulty; see the sketch below.
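
For reference, "fetching the Ajax data directly" would look roughly like the following, done with requests instead of a browser. The endpoint path and the paging parameter here are hypothetical placeholders, not the site's actual XHR; the real ones can be read from the Network panel in the browser's developer tools.

import requests

# Sketch only: I did not record the site's actual XHR, so the endpoint and
# the query parameter below are hypothetical placeholders. Read the real
# ones from the Network panel in the browser's developer tools.
AJAX_URL = "http://cy.ncss.org.cn/search/projects"  # placeholder endpoint
params = {"pageIndex": 1}                           # hypothetical paging parameter
headers = {
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",           # how Ajax requests are usually marked
}

response = requests.get(AJAX_URL, params=params, headers=headers)
print(response.status_code, response.text[:200])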

Even so, I decided to scrape it with Selenium, treating it as a chance to review how Selenium works.

Scraping Process

Code (Python 3.6)

import csv

import requests
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class PioneerSpider(object):
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
    }
    driver_path = r"/home/laen/program/chromedriver_linux64/chromedriver"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = "http://cy.ncss.org.cn/search/projects#"
        fp = open("pioneer.csv", "a", encoding="utf-8", newline="")  # save the data as CSV
        self.writer = csv.DictWriter(fp, ["name", "school", "desc", "url"])
        self.writer.writeheader()

    def run(self):
        self.driver.get(self.url)
        # wait for the page to load
        WebDriverWait(driver=self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//ul[@class='map-item-box']")))
        other_btn = self.driver.find_elements_by_xpath("//span[contains(@class, 'glyphicon')]")[1]
        other_btn.click()

        self.school = input("Enter the university to query: ")  # filter projects by university
        self.province = input("Enter the university's province (municipality or region): ")  # narrow by region first, otherwise it is slow

        # wait for the province list to load
        WebDriverWait(driver=self.driver, timeout=10).until(
            EC.presence_of_element_located((By.XPATH, "//ul[@class='map-item-box']")))
        # find_element_by_link_text raises NoSuchElementException rather than
        # returning None, so keep prompting until a matching link is found
        while True:
            try:
                province_btn = self.driver.find_element_by_link_text(self.province)
                province_btn.click()
                break
            except NoSuchElementException:
                self.province = input("Enter a valid province (municipality or region): ")

        # click through the result pages one by one
        while True:
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='search-list-item']")))
            source = self.driver.page_source
            self.parse_list_page(source)
            WebDriverWait(driver=self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='pagination']/a")))
            try:
                next_btn = self.driver.find_element_by_xpath("//div[@class='pagination']/a[@class='next']")
            except NoSuchElementException:
                break  # no "next" link on the last page
            next_btn.click()

    def parse_list_page(self, source):
        """Collect the detail-page URLs of projects from the target university."""
        html = etree.HTML(source)
        divs = html.xpath("//div[@class='search-list-item']/div[@class='project-list-info']")
        for div in divs:
            schools = div.xpath(".//div[@class='project-list-item-tags-text']/span[1]/text()")
            for school in schools:
                if school == self.school:
                    link = div.xpath("./a/@href")[0]
                    link = "http://cy.ncss.org.cn" + link
                    self.parse_detail_page(link)

    def parse_detail_page(self, link):
        """Parse a project detail page."""
        dic = {}
        response = requests.get(link, headers=self.headers)
        text = response.text

        html = etree.HTML(text)
        name = html.xpath("//div[@class='banner-top clearfix']/div[@class='info-box']/h4/text()")[0]
        desc = html.xpath("//div[@id='project']//p//text()")
        desc = "".join(desc)

        dic["name"] = name
        dic["school"] = self.school
        dic["desc"] = desc
        dic["url"] = link

        self.writer_dic(dic)

    def writer_dic(self, dic):
        """Write one row to the CSV file."""
        self.writer.writerow(dic)
        print(dic)
        print("=" * 100)


if __name__ == '__main__':
    spider = PioneerSpider()
    spider.run()

Summary

The main goal this time was to review Selenium. I could still add multithreading for better throughput and write the results into MySQL, but I was too lazy; maybe next time. Also, the site loads its data painfully slowly, so be careful to make sure each page has finished loading before parsing it, or the script will throw errors.
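
The multithreading mentioned above could look roughly like this: a minimal sketch using concurrent.futures.ThreadPoolExecutor to fetch the detail pages concurrently. fetch_detail and fetch_all are hypothetical helpers mirroring parse_detail_page, not part of the original script; the lock is needed because csv writers are not thread-safe.

import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree

write_lock = threading.Lock()  # csv writers are not thread-safe; serialize writes

def fetch_detail(spider, link):
    """Hypothetical helper: same logic as parse_detail_page, but lock-guarded."""
    response = requests.get(link, headers=spider.headers)
    html = etree.HTML(response.text)
    name = html.xpath("//div[@class='banner-top clearfix']/div[@class='info-box']/h4/text()")[0]
    desc = "".join(html.xpath("//div[@id='project']//p//text()"))
    dic = {"name": name, "school": spider.school, "desc": desc, "url": link}
    with write_lock:
        spider.writer_dic(dic)

def fetch_all(spider, links, max_workers=8):
    """Hypothetical helper: fetch a batch of detail-page links concurrently."""
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for link in links:
            pool.submit(fetch_detail, spider, link)

parse_list_page would then collect the matching links into a list and hand them to fetch_all in one go, instead of calling parse_detail_page serially.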

CATALOG
  1. Preface
  2. Main Content
    2.1. Scraping Process
    2.2. Code (Python 3.6)
  3. Summary