发布网友 发布时间:2022-04-21 06:02
共1个回答
热心网友 时间:2023-11-06 06:08
# I previously used the code below to crawl Douban users; it was reasonably efficient.
# It relies on the gevent coroutine module (if I remember the name correctly) - give it a try.
#!/usr/bin/env python
#coding: utf-8
#
# Crawl several lists of page URLs concurrently, one greenlet per category.
# Each category reads its URLs from "<category>.link" (one URL per line) and
# writes one "name,text" row per matched <div class="name"> element to
# "done/<category>.txt".

# gevent recommends monkey-patching as early as possible, before other modules
# bind references to the blocking standard-library functions.
from gevent import monkey
monkey.patch_all()

from selenium import webdriver
from selenium.webdriver.phantomjs.service import Service as PhantomJSService
from BeautifulSoup import BeautifulSoup
import gevent
import sys
import time

# Location of the PhantomJS binary used by every worker.
PHANTOMJS_PATH = r'D:\TestProject\phantomjs\bin\phantomjs.exe'

# Categories to crawl, in the order their greenlets are spawned.
CATEGORIES = ('culture', 'travel', 'ent', 'fashion', 'life', 'tech')


def _text_of(tag):
    """Return the UTF-8 encoded ``.string`` of *tag*, or '' when it is absent."""
    if tag is None or tag.string is None:
        return ''
    return tag.string.encode('utf8')


def doJob(urls, name):
    """Visit every URL in *urls* and record the name entries found on each page.

    For each page, every ``<div class="name">`` element contributes one line
    ``<a text>,<span text>`` to ``done/<name>.txt``. URLs whose page contains
    no such element are printed so they can be inspected later.

    Args:
        urls: iterable of page URLs to load.
        name: category name; used to build the output file name.
    """
    service_args = []
    browser = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH,
                                  service_args=service_args)
    try:
        # The with-block closes the output file even if a page load or the
        # parsing raises; the finally-clause does the same for the browser.
        with open('done/' + name + '.txt', 'w') as wr:
            for url in urls:
                browser.get(url)
                # Pause between page loads. With monkey.patch_all() applied,
                # this sleep is expected to yield to the other greenlets.
                time.sleep(1)
                soup = BeautifulSoup(browser.page_source.encode('utf-8'))
                findNames = soup.findAll('div', attrs={'class': 'name'})
                # findAll returns an empty list (never None) when nothing
                # matches, so test for emptiness to report the URL.
                if not findNames:
                    print(url)
                for sub in findNames:
                    wr.write(_text_of(sub.a) + ',' + _text_of(sub.span) + '\n')
    finally:
        browser.quit()


# Load the URL list of every category, skipping blank lines so that an empty
# string (e.g. from a trailing newline) is never handed to the browser.
files = {}
for key_fn in CATEGORIES:
    with open(key_fn + '.link', 'r') as f:
        files[key_fn] = [line.strip() for line in f if line.strip()]

# Run one worker greenlet per category and wait for all of them to finish.
gevent.joinall([
    gevent.spawn(doJob, files[key_fn], key_fn) for key_fn in CATEGORIES
])

# Follow-up from the asker: "What is this?"