Auth: 王海飞
Date:2018-06-16
Email:779598160@qq.com
github:https://github.com/coco369/knowledge
思路:
案例1:获取电影的分类信息,针对一个分类,启动一个线程。即有多少分类的url,启动多少个线程
案例2:获取电影的分类信息,将分类的url储存在一个列表中,启动两个线程从列表中获取要访问的url,然后爬取url的内容,再进行数据分析即可。(使用线程锁)
import urllib.request
from urllib import parse
import json
import time
import threading
import pymongo
"""
获取豆瓣电影中的电影资源
豆瓣电影url地址:https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0
分析:
1. 该页面中的电影资源信息都是通过ajax异步加载刷新出来的
2. 在F12下的network中过滤XHR(XMLHttpRequest)请求,可以查看到真正的异步请求地址如下
https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=20
3. 真正的请求地址中,type为类型,tag为标签(热门、经典、最新、爱情、科幻等等),sort为排序,page_limit为每一页的条数,page_start为开始的条数下标
4. 获取tag类型的url地址为: https://movie.douban.com/j/search_tags?type=movie&source=
"""
def urllib_open(url):
    """
    Fetch a URL and return its body decoded as UTF-8 text.

    The request is sent with a desktop-browser User-Agent header.

    :param url: address to request
    :return: the response body as a str
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=header)
    # The context manager closes the response even when read() or decode()
    # raises; previously the response object was never closed.
    with urllib.request.urlopen(req) as res:
        return res.read().decode('utf-8')
def get_movie_tag(url):
    """
    Download the movie tag listing and return its tags.

    :param url: address of the tag endpoint
    :return: the value stored under the 'tags' key of the JSON response
    """
    # The body arrives as a JSON string shaped like '{"tags": [...]}', so it
    # is decoded first and only the 'tags' entry is handed back.
    return json.loads(urllib_open(url))['tags']
def get_movies(movies_url, db):
    """
    Fetch one listing page, store each movie through ``db`` and print it.

    :param movies_url: search URL; its tag= query parameter selects the category
    :param db: database handle whose ``movies2`` collection receives one
        document per movie
    """
    payload = json.loads(urllib_open(movies_url))
    for movie in payload['subjects']:
        title, rate = movie['title'], movie['rate']
        db.movies2.insert_one({'m_name': title, 'm_rate': rate})
        print('标题:%s,评分:%s' % (title, rate))
def main():
    """
    Crawl the first listing page of every movie tag, one thread per tag.

    Builds one search URL per tag returned by get_movie_tag(), runs
    get_movies() for each URL in its own thread, and returns only after
    every thread has finished.
    """
    # Database connection used by all worker threads.
    mongo_client = pymongo.MongoClient('mongodb://45.76.206.145:27017')
    try:
        db = mongo_client.douban
        tag_url = 'https://movie.douban.com/j/search_tags?type=movie&source='
        movies_url = 'https://movie.douban.com/j/search_subjects?type=movie&%s&sort=recommend&page_limit=20&page_start=0'

        # urlencode turns each tag into a ready-to-use "tag=..." query fragment.
        result_list = [
            movies_url % (parse.urlencode({'tag': tag}),)
            for tag in get_movie_tag(tag_url)
        ]
        threading_list = [
            threading.Thread(target=get_movies, args=(url, db))
            for url in result_list
        ]

        for thread in threading_list:
            thread.start()
        # Join in a second pass: joining inside the start loop would run the
        # threads one after another. Without any join, main() returned while
        # the crawl was still running, so timing around it measured nothing.
        for thread in threading_list:
            thread.join()
    finally:
        # Release the connection once no worker can still be using it.
        mongo_client.close()
if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # documented replacement for measuring elapsed time.
    print(time.perf_counter())
    main()
    print(time.perf_counter())
使用线程锁
import threading
import requests
import json
from urllib import parse
def get_html(url):
    """
    Request a URL with a desktop-browser User-Agent and return the body text.

    :param url: address to request
    :return: the ``text`` attribute of the response
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    response = requests.get(url, headers={'User-Agent': user_agent})
    return response.text
def get_movie_tag(url):
    """
    Fetch the movie tag listing and return the tags it contains.

    :param url: address of the tag endpoint
    :return: the value stored under the 'tags' key of the JSON response
    """
    # get_html() yields the raw JSON text (shaped like '{"tags": [...]}');
    # parse it into a dict before picking out the tag list.
    parsed = json.loads(get_html(url))
    return parsed['tags']
class SpiderOperation(threading.Thread):
    """
    Worker thread that takes one URL from the module-level
    ``task_result_list`` and prints the movies listed at that URL.
    """

    # One lock shared by every worker. Creating a new Lock per instance, as
    # before, meant no two threads ever contended for the same lock, so the
    # shared task list was effectively unprotected.
    _shared_task_lock = threading.Lock()

    def __init__(self):
        super(SpiderOperation, self).__init__()
        # Still exposed as an instance attribute, but every instance now
        # refers to the same lock object.
        self.task_lock = self._shared_task_lock

    def update_task_list(self):
        """
        Take the next URL off the shared task list while holding the lock.

        :return: the last URL in ``task_result_list``, or '' when it is empty
        """
        # `with` releases the lock even if the body raises.
        with self.task_lock:
            print(len(task_result_list))
            return task_result_list.pop() if task_result_list else ''

    def run(self):
        """
        Claim one URL, download it and print each movie's title and rating.

        Does nothing beyond printing the empty link when no task is left.
        """
        task_link = self.update_task_list()
        print(task_link)
        if not task_link:
            return
        payload = json.loads(get_html(task_link))
        for movie in payload['subjects']:
            print('标题:%s,评分:%s' % (movie['title'], movie['rate']))
if __name__ == '__main__':
    tag_url = 'https://movie.douban.com/j/search_tags?type=movie&source='
    movie_url = 'https://movie.douban.com/j/search_subjects?type=movie&%s&sort=recommend&page_limit=20&page_start=0'
    tags = get_movie_tag(tag_url)

    # Module-level list that SpiderOperation.update_task_list() pops from.
    # A `global` statement is a no-op at module scope, so none is needed.
    # Each entry is the search URL for one tag.
    task_result_list = [
        movie_url % (parse.urlencode({'tag': tag}),)
        for tag in tags
    ]

    while task_result_list:
        spider1 = SpiderOperation()
        spider2 = SpiderOperation()
        spider1.start()
        spider2.start()
        # Wait for this pair before starting the next one. Without the joins
        # the loop kept creating threads as fast as it could until the first
        # workers had emptied the list.
        spider1.join()
        spider2.join()