Python 3 crawler: scraping the 1024 image board

I've been using Python for a while now and have always wanted to write a crawler, but with finals approaching lately I really haven't had much time, so I only put together a demo. It occasionally throws the odd error, but it runs, and downloading a few hundred images is no problem; the remaining issues will probably have to wait until the holidays. I'm posting the code here for discussion, and suggestions are very welcome.

On to the main topic.

When writing this crawler I referred to 纯洁的微笑's blog, and the approach is basically the same; his post is here:

My code is as follows:

from bs4 import BeautifulSoup

import re
import os
import requests
import json
import time

import OpenSSL

mainsite="http://1024的网址就不贴了.com/"
def getbs(url):
        header={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                "Referer":"http://t66y.com//thread0806.php?fid=16&search=&page=1",
                "Host":"t66y.com"
                }
        req=requests.get(url,headers=header)
        req.encoding="gbk"#这里因为1024图片帖子内的编码是gbk,如果不指明编码,得到的是乱码
        bsobj = BeautifulSoup(req.text, "html5lib")
        return bsobj

def getallpage(start,end):
        urls=[]
        for i in range(start,end+1):
                url="http://地址打码/thread0806.php?fid=16&search=&page={}".format(str(i))
                bsobj=getbs(url)
                urls+=bsobj.find_all("a",{"href":re.compile("^htm_data.*")})
        return urls
def getpicofpage(url):
        bsobj=getbs(url)
        div=bsobj.find("div",{"class":"tpc_content do_not_catch"})
        if div is None:
                print("no content found on this page, skipping")
                return -1
        inputs=div.find_all("input")
        title=bsobj.find("h4").text
        if not inputs:
                print("no images on this page, skipping")
                return -1
        num=1
        save_dir = os.path.join(path, "new", "tupian", title)  # one folder per post
        if not os.path.exists(save_dir):
                os.makedirs(save_dir)
        else:
                print("folder already exists, skipping")
                return -1
        for i in inputs:
                try:  # this is where the problems mainly occur
                        res = requests.get(i["src"], timeout=25)
                        # timestamp plus counter, so images fetched within the same second don't overwrite each other
                        with open(os.path.join(save_dir, "{}_{}.jpg".format(str(time.time())[:10], num)), 'wb') as f:
                                f.write(res.content)
                except requests.exceptions.Timeout:  # some images time out; without a timeout the request can hang there forever
                        print("timed out, skipping this page")
                        return -1
                except OpenSSL.SSL.WantReadError:  # another problem: this exception sometimes shows up, but it never gets caught here, and I haven't figured out what it is about
                        print("OpenSSL.SSL.WantReadError, skipping")
                        return -1
                print(num)
                num+=1
l=getallpage(5,10)
page=1
ed=[]
for i in l:
        url=mainsite+i["href"]
        if url in ed:
                print(url+"本页已采集过,跳过")
                continue
        print(url)
        getpicofpage(url)
        ed.append(url)
        print("采集完第{}页".format(page))
        page+=1
        time.sleep(3)
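One thing I noticed about the exception I couldn't catch: in the traceback pasted below, the WantReadError is handled inside urllib3, and what actually escapes from requests.get() is requests.exceptions.ProxyError, a subclass of requests.exceptions.RequestException, so catching at the requests level should cover it. Here is a rough sketch of a more defensive download helper along those lines; the helper name and the filename scheme are just for illustration, not part of the script above:

import os
import time
import requests

def download_image(src, save_dir, num, timeout=25):
        # Fetch one image; any requests-level failure (timeout, SSL trouble,
        # proxy errors) lands in the single except branch below.
        try:
                res = requests.get(src, timeout=timeout)
                res.raise_for_status()
        except requests.exceptions.RequestException as e:
                print("download failed, skipping: {}".format(e))
                return False
        # second-resolution timestamp plus a counter, so two images fetched
        # within the same second don't overwrite each other
        filename = "{}_{}.jpg".format(int(time.time()), num)
        with open(os.path.join(save_dir, filename), "wb") as f:
                f.write(res.content)
        return True

Another thing worth handling is that post titles can contain characters that are not allowed in Windows folder names (for example ? or |), so stripping those from title before creating the folder would avoid another class of errors.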

I'm also pasting the SSL exception mentioned above:

 

Traceback (most recent call last):
  File "D:\python\Lib\site-packages\urllib3\contrib\pyopenssl.py", line 441, in wrap_socket
    cnx.do_handshake()
  File "D:\python\Lib\site-packages\OpenSSL\SSL.py", line 1806, in do_handshake
    self._raise_ssl_error(self._ssl, result)
  File "D:\python\Lib\site-packages\OpenSSL\SSL.py", line 1521, in _raise_ssl_error
    raise WantReadError()
OpenSSL.SSL.WantReadError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\python\Lib\site-packages\urllib3\connectionpool.py", line 595, in urlopen
    self._prepare_proxy(conn)
  File "D:\python\Lib\site-packages\urllib3\connectionpool.py", line 816, in _prepare_proxy
    conn.connect()
  File "D:\python\Lib\site-packages\urllib3\connection.py", line 326, in connect
    ssl_context=context)
  File "D:\python\Lib\site-packages\urllib3\util\ssl_.py", line 329, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_hostname)
  File "D:\python\Lib\site-packages\urllib3\contrib\pyopenssl.py", line 445, in wrap_socket
    raise timeout('select timed out')
socket.timeout: select timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\python\Lib\site-packages\requests\adapters.py", line 440, in send
    timeout=timeout
  File "D:\python\Lib\site-packages\urllib3\connectionpool.py", line 639, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "D:\python\Lib\site-packages\urllib3\util\retry.py", line 388, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.srimg.com', port=443): Max retries exceeded with url: /u/20180104/11315126.jpg (Caused by ProxyError('Cannot connect to proxy.', timeout('select timed out',)))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\PyCharm 2017.3.1\helpers\pydev\pydev_run_in_console.py", line 52, in run_file
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "D:\PyCharm 2017.3.1\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "D:/learnPython/crawler/crawler.py", line 301, in <module>
    getpicofpage(url)
  File "D:/learnPython/crawler/crawler.py", line 281, in getpicofpage
    res = requests.get(i["src"],timeout=25)
  File "D:\python\Lib\site-packages\requests\api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "D:\python\Lib\site-packages\requests\api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "D:\python\Lib\site-packages\requests\sessions.py", line 508, in request
    resp = self.send(prep, **send_kwargs)
  File "D:\python\Lib\site-packages\requests\sessions.py", line 618, in send
    r = adapter.send(request, **kwargs)
  File "D:\python\Lib\site-packages\requests\adapters.py", line 502, in send
    raise ProxyError(e, request=request)
requests.exceptions.ProxyError: HTTPSConnectionPool(host='www.srimg.com', port=443): Max retries exceeded with url: /u/20180104/11315126.jpg (Caused by ProxyError('Cannot connect to proxy.', timeout('select timed out',)))
PyDev console: starting.

One more thing: even with my VPN on, crawling directly gets no content and I'm told the host isn't responding, but I later found that it works as long as Fiddler is running. I guess it's an IP issue; I haven't looked into it carefully yet, so please share your advice.
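My guess (unverified) is that Fiddler helps because it runs a local HTTP proxy, by default on 127.0.0.1:8888, and traffic that goes through that proxy reaches the site while direct connections do not. If that is what's happening, the proxy can be pointed at explicitly instead of relying on whatever the system picks up; the address and port below are assumptions, not something I've confirmed:

import requests

# 127.0.0.1:8888 is Fiddler's default listening port (an assumption on my part);
# replace it with whatever proxy actually works on your machine
PROXIES = {
        "http": "http://127.0.0.1:8888",
        "https": "http://127.0.0.1:8888",
}

def fetch(url, headers=None, timeout=25):
        # same as the plain requests.get call in getbs(), but with the proxy pinned explicitly
        return requests.get(url, headers=headers, proxies=PROXIES, timeout=timeout)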

 
