# 爬虫学习笔记 **Repository Path**: anson_xu/crawler_-study ## Basic Information - **Project Name**: 爬虫学习笔记 - **Description**: 爬虫学习代码 - **Primary Language**: Python - **License**: Not specified - **Default Branch**: master - **Homepage**: None - **GVP Project**: No ## Statistics - **Stars**: 0 - **Forks**: 0 - **Created**: 2020-09-18 - **Last Updated**: 2020-12-19 ## Categories & Tags **Categories**: Uncategorized **Tags**: None ## README # Crawler_Study ### 爬虫学习笔记仓库 # urllib ## 1.Send the request response 的一个类型及6个方法 + 类型 + httpresponse + 方法 + read + readline + readlines + geturl + getheader + getcode ```python url = 'http://www.baidu.com' # response对象的类型是HTTPResponse response = urllib.request.urlopen(url) # 读取 response.read() # 读取一行 response.readline() # 读取全部,一行一行读取 response.readlines() # 获取资源路径 response.geturl() # 获取响应头 response.getheaders() # 获取服务器响应的状态码 response.getcode() ``` ## 2.Download 使用`urlretrieve` 方法下载文件 ```python url = 'http://www.baidu.com' # filename 是下载完成之后,保存文件的名称 # 下载网页 urllib.request.urlretrieve(url=url, filename='baidu.html') # 下载图片 url_image = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1600269114178&di=fca4fcb8c0bfa8057b5b593a6e78487e&imgtype=0&src=http%3A%2F%2F00.imgmini.eastday.com%2Fmobile%2F20170927%2F20170927151425_d41d8cd98f00b204e9800998ecf8427e_4.jpeg' urllib.request.urlretrieve(url=url_image, filename='../zms.jpg') ``` ## 3.请求对象的定制 ```python url = 'http://www.baidu.com' headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50' } # 模拟浏览器向服务器发送请求,但是有部分网站会检测UA # 如果请求中没有UA 那就不会返回数据 # 请求对象的定制 # 请求对象的定制 就是将请求资源路径 请求参数 请求头放在一起 request = urllib.request.Request(url=url, headers=headers) response = urllib.request.urlopen(request) ``` ## 4.编码 百度搜索的关键字中需要对汉字进行编码 使用`parse.quote` 函数进行编码 ```python import urllib.request import urllib.parse # 爬虫的编码 s = '周杰伦' # 会将汉语进行编码 a = urllib.parse.quote(s) print(a) # 浏览器会自动编解码 pycharm不会自动编解码 url = 
'https://www.baidu.com/s?wd=' key_word = '周杰伦' code_key_word = urllib.parse.quote(key_word) url = url + code_key_word print(url) headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 'Cookie': 'BIDUPSID=050884BF79605B3FE5B441130814BD2B; PSTM=1600135523; BAIDUID=050884BF79605B3FB4E3B4F336CDFE50:FG=1; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; sug=3; sugstore=1; ORIGIN=0; bdime=0; BDUSS=0J0QnVyV2lMMFNEMEtmRWpRLWdTWlZ4ZzI5Wkt0YXpKZERVNUhWcEE2MGJGb2hmRVFBQUFBJCQAAAAAAAAAAAEAAAAPfeKmsK7C~cXctcTO2rnqMwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABuJYF8biWBfT; BDUSS_BFESS=0J0QnVyV2lMMFNEMEtmRWpRLWdTWlZ4ZzI5Wkt0YXpKZERVNUhWcEE2MGJGb2hmRVFBQUFBJCQAAAAAAAAAAAEAAAAPfeKmsK7C~cXctcTO2rnqMwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABuJYF8biWBfT; BD_HOME=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=8131_0_8_0_15_7_1_0_5_4_2_3_0_0_1_0_1600250890_0_1600259020%7C9%230_0_1600259020%7C1; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; H_PS_645EC=2095Q1TdarCNVRr9ilYOnHk51HYr7VOda%2B%2B5r2jTHRuMltuw3coHrV%2FI%2Fyg; H_PS_PSSID=7540_32617_1439_31253_7609_7552_7608_32116_32718_22160; BDSVRTM=0' } request = urllib.request.Request(url=url, headers=headers) response = urllib.request.urlopen(request) print(response.read().decode('utf-8')) ``` ## 5.对多个参数进行编码 使用`parse.urlencode` 函数对多个参数进行编码 ```python # https://www.baidu.com/s?wd=韩红&sex=女 # 多个参数进行编码 data = { 'wd': '韩红', 'sex': '女' } data = urllib.parse.urlencode(data) print(data) # 应用 url = 'https://www.baidu.com/s?' 
headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 'Cookie': 'BIDUPSID=050884BF79605B3FE5B441130814BD2B; PSTM=1600135523; BAIDUID=050884BF79605B3FB4E3B4F336CDFE50:FG=1; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; sug=3; sugstore=1; ORIGIN=0; bdime=0; BDUSS=0J0QnVyV2lMMFNEMEtmRWpRLWdTWlZ4ZzI5Wkt0YXpKZERVNUhWcEE2MGJGb2hmRVFBQUFBJCQAAAAAAAAAAAEAAAAPfeKmsK7C~cXctcTO2rnqMwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABuJYF8biWBfT; BDUSS_BFESS=0J0QnVyV2lMMFNEMEtmRWpRLWdTWlZ4ZzI5Wkt0YXpKZERVNUhWcEE2MGJGb2hmRVFBQUFBJCQAAAAAAAAAAAEAAAAPfeKmsK7C~cXctcTO2rnqMwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABuJYF8biWBfT; BD_HOME=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=8131_0_8_0_15_7_1_0_5_4_2_3_0_0_1_0_1600250890_0_1600259020%7C9%230_0_1600259020%7C1; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; H_PS_645EC=2095Q1TdarCNVRr9ilYOnHk51HYr7VOda%2B%2B5r2jTHRuMltuw3coHrV%2FI%2Fyg; H_PS_PSSID=7540_32617_1439_31253_7609_7552_7608_32116_32718_22160; BDSVRTM=0' } data = { 'wd': '韩红', 'sex': '女' } # 编码 data = urllib.parse.urlencode(data) url = url + data # 请求对象的定制 request = urllib.request.Request(url=url, headers=headers) # 获取响应 response = urllib.request.urlopen(request) print(response.read().decode('utf-8')) ``` 总结: + url data headers + request + response + content ## 6.Post 请求 ```python import urllib.request import urllib.parse url = 'https://fanyi.baidu.com/sug' data = { 'kw': 'car' } # post请求必须要进行编码并且进行encode方法的调用 data = urllib.parse.urlencode(data).encode('utf-8') headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 'Cookie': 'BIDUPSID=050884BF79605B3FE5B441130814BD2B; PSTM=1600135523; 
BAIDUID=050884BF79605B3FB4E3B4F336CDFE50:FG=1; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; sug=3; sugstore=1; ORIGIN=0; bdime=0; BDUSS=0J0QnVyV2lMMFNEMEtmRWpRLWdTWlZ4ZzI5Wkt0YXpKZERVNUhWcEE2MGJGb2hmRVFBQUFBJCQAAAAAAAAAAAEAAAAPfeKmsK7C~cXctcTO2rnqMwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABuJYF8biWBfT; BDUSS_BFESS=0J0QnVyV2lMMFNEMEtmRWpRLWdTWlZ4ZzI5Wkt0YXpKZERVNUhWcEE2MGJGb2hmRVFBQUFBJCQAAAAAAAAAAAEAAAAPfeKmsK7C~cXctcTO2rnqMwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABuJYF8biWBfT; BD_HOME=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; BD_CK_SAM=1; PSINO=3; COOKIE_SESSION=8131_0_8_0_15_7_1_0_5_4_2_3_0_0_1_0_1600250890_0_1600259020%7C9%230_0_1600259020%7C1; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; BDRCVFR[CLK3Lyfkr9D]=mk3SLVN4HKm; H_PS_645EC=2095Q1TdarCNVRr9ilYOnHk51HYr7VOda%2B%2B5r2jTHRuMltuw3coHrV%2FI%2Fyg; H_PS_PSSID=7540_32617_1439_31253_7609_7552_7608_32116_32718_22160; BDSVRTM=0' } # post请求参数 需要请求对象的定制 post请求的参数不需要和url进行拼接 # 会自动将参数添加到请求体中 request = urllib.request.Request(url=url, headers=headers, data=data) response = urllib.request.urlopen(request) content = response.read().decode('utf-8') print(content) ```