Python爬取豆瓣电影top250排行榜

admin
🌐 经济型:买域名、轻量云服务器、用途:游戏 网站等 《腾讯云》特点:特价机便宜 适合初学者用 点我优惠购买
🚀 拓展型:买域名、轻量云服务器、用途:游戏 网站等 《阿里云》特点:中档服务器便宜 域名备案事多 点我优惠购买
🛡️ 稳定型:买域名、轻量云服务器、用途:游戏 网站等 《西部数码》 特点:比上两家略贵但是稳定性超好事也少 点我优惠购买

温馨提示:这篇文章已超过583天没有更新,请注意相关的内容是否还可用!

Python爬取豆瓣电影Top250排行榜的示例代码:用requests抓取页面,用parsel和re两个模块解析,并用csv保存结果,代码如下:


import requests

import csv

import re

import parsel

# Scrape the Douban movie Top 250 list and save one CSV row per movie.
#
# The list is fetched as PAGE_COUNT pages of PAGE_SIZE entries via the
# `start` query parameter; every movie found on a page becomes one row in
# OUTPUT_PATH.

OUTPUT_PATH = "豆瓣top250.csv"
PAGE_SIZE = 25
PAGE_COUNT = 10

# One <li> per movie; every field below is read relative to this node so a
# movie that lacks a field (for example the one-line quote) yields an empty
# cell instead of shifting the values of all later movies.
ITEM_XPATH = '//*[@id="content"]/div/div[1]/ol/li'

# Mind the capitalization of the header names (note from the original author).
# NOTE(review): the Cookie below is a captured browser session hard-coded in
# the script. It is most likely expired and should not be published; replace
# it with your own value, ideally loaded from outside the source file.
headers = {
    'Cookie': 'll="118192"; bid=SxMSLUjm454; __utma=30149280.231185692.1663748575.1663748575.1663748575.1; __utmc=30149280; __utmz=30149280.1663748575.1.1.utmcsr=bAIdu|utmccn=(organic)|utmcmd=organic; __utmt=1; _pk_ref.100001.4cf6=["","",1663748581,"httPs://www.douban.com/"]; _pk_ses.100001.4cf6=*; __utmc=223695111; ap_v=0,6.0; __gads=ID=614eff214af342d2-221efc6e45d700a2:T=1663748581:RT=1663748581:S=ALNI_MY0JTwsKMOM9E6Uz_e8b88JW-wE9g; __gpi=UID=000009d31ea23134:T=1663748581:RT=1663748581:S=ALNI_MZ4KfeKbaWWs0Aeu0t5jqh2RD0IsA; Hm_lvt_16a14f3002af32bf3a75dfe352478639=1663748600; Hm_lpvt_16a14f3002af32bf3a75dfe352478639=1663748600; _vwo_uuid_v2=D913112FAEA958ABF7FCF7279209CA382|d77575243D8ec24.02f391aa8e5672ff; __utma=223695111.399175606.1663748581.1663748581.1663748653.2; __utmb=223695111.0.10.1663748653; __utmz=223695111.1663748653.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmb=30149280.2.10.1663748575; dbcl2="262975336:xPwIu1zP+eU"; ck=AvuC; push_noty_num=0; push_doumail_num=0; _pk_id.100001.4cf6=28fbd6e9a044e121.1663748581.1.1663748823.1663748581.',
    'Referer': 'https://www.baidu.com/link?url=P6mLfMtLSzXHxZYitwSc9UDnuTlARc-CJk-15rb3SfSKZlZQcjj-36ER1uqKcs1bl0s-eI6n1Onsaydsdu9zc_&wd=&eqid=b5b5e8680002085b00000005632acac7',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
}

# The file is opened once for the whole run (the original truncated it for the
# header and then reopened it in append mode for every single row).
# "utf_8_sig" makes Python write a UTF-8 byte-order mark at the start of the
# file -- presumably so spreadsheet programs detect the encoding.
with open(OUTPUT_PATH, mode="w", encoding="utf_8_sig", newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['片名', '类型', '评价人数', "上映时间", '导演_演员', '国家', '英文名', '简介'])

    for page in range(PAGE_COUNT):
        url = f'https://movie.douban.com/top250?start={PAGE_SIZE * page}&filter='

        # A timeout keeps a stalled connection from hanging the script, and
        # raise_for_status() stops on an HTTP error instead of parsing an
        # error page as if it were the list.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()

        selector = parsel.Selector(response.text)

        for item in selector.xpath(ITEM_XPATH):
            title = item.xpath('./div/div[2]/div[1]/a/span[1]/text()').get(default='').strip()
            english_name = item.xpath('./div/div[2]/div[1]/a/span[2]/text()').get(default='').strip()
            judge_num = item.xpath('./div/div[2]/div[2]/div/span[4]/text()').get(default='').strip()
            introduction = item.xpath('./div/div[2]/div[2]/p[2]/span/text()').get(default='').strip()

            # The first <p> holds two text nodes separated by <br>: the
            # director/actor line, then "year / country / genre".
            # NOTE(review): the original took the first node with a page-wide
            # regex whose "\s" had lost its backslash, so it never matched.
            # Reading text()[1] of the same <p> assumes that paragraph is the
            # one the regex targeted -- confirm against the live page.
            director = item.xpath('./div/div[2]/div[2]/p[1]/text()[1]').get(default='').strip()
            details = item.xpath('./div/div[2]/div[2]/p[1]/text()[2]').get(default='').strip()

            # Split from the right so only the last two "/" act as field
            # separators; any extra "/" stays inside the year field. Each
            # piece is stripped, and missing pieces become empty strings
            # rather than raising IndexError.
            pieces = [piece.strip() for piece in details.rsplit('/', 2)]
            year, country, genre = (pieces + ['', ''])[:3]

            csv_writer.writerow([
                title,
                genre,
                judge_num,
                year,
                director,
                country,
                english_name,
                introduction,
            ])


文章版权声明:除非注明,否则均为执刀人的工具库原创文章,转载或复制请以超链接形式注明出处。

发表评论

快捷回复: 表情:
AddoilApplauseBadlaughBombCoffeeFabulousFacepalmFecesFrownHeyhaInsidiousKeepFightingNoProbPigHeadShockedSinistersmileSlapSocialSweatTolaughWatermelonWittyWowYeahYellowdog
验证码
评论列表 (暂无评论,253人围观)

还没有评论,来说两句吧...

目录[+]

取消
微信二维码
微信二维码
支付宝二维码