socket解析HTTP请求内容
思路
1. 解析HTTP请求的头部
HTTP请求头部的结束符行为"\r\n",可以按行读取HTTP请求头的内容,如果读到一行为"\r\n",说明HTTP请求头结束。
2. 请求头里面含有Content-Length参数
如果HTTP请求里面有Content-Length参数,说明HTTP请求的内容大小是确定的,直接读取Content-Length的值,然后读取相应字节数的内容即可。
3. 请求头里面含有Transfer-Encoding: chunked 参数
如果HTTP请求里面有Transfer-Encoding: chunked参数,说明HTTP请求的内容大小事先是不确定的。这种内容由若干分块组成,每个分块以一行十六进制的块长度开头,后面是相应字节数的数据和"\r\n";整个内容的结束符是"0\r\n\r\n"。因此可以按行读取HTTP请求的内容部分,如果连续读到"0\r\n"和"\r\n"说明内容读取完毕。
代码实现
代码中:self._file 是 socket.makefile() 返回的文件对象(需以二进制模式打开,readline()/read() 返回 bytes)。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
def get_http_content(self):
    """Read one HTTP request from ``self._file`` and store its body.

    ``self._file`` is read with ``readline()`` / ``read()`` and must yield
    ``bytes``.  The header section is consumed up to the blank line, then
    the body is read according to the framing headers (names are matched
    case-insensitively, with or without a space after the colon):

    * ``Transfer-Encoding`` whose last coding is ``chunked``: each chunk
      (hexadecimal size line, data, line terminator) is decoded until the
      zero-size chunk, and trailer lines are skipped.  This takes
      precedence over ``Content-Length``.
    * ``Content-Length: N`` with ``N > 0``: up to ``N`` bytes are read; a
      body cut short by end-of-stream is stored as received.

    The result is delivered through ``self._content``: the body decoded as
    UTF-8, or ``False`` when the request has no body, the stream ends
    before the headers or a chunk are complete, or a framing value is
    malformed.

    Returns:
        None.

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    blank_lines = (b"\r\n", b"\n")
    content_length = 0
    chunked = False

    # Header section: scan until the blank line that terminates it.
    while True:
        raw_line = self._file.readline()
        if not raw_line:
            # End of stream before the headers finished.
            self._content = False
            return
        if raw_line in blank_lines:
            break
        name, colon, value = raw_line.partition(b":")
        if not colon:
            # Request line (or junk) without a colon: not a header field.
            continue
        name = name.strip().lower()
        value = value.strip()
        if name == b"content-length":
            try:
                content_length = int(value)
            except ValueError:
                content_length = -1
            if content_length < 0:
                self._content = False
                return
        elif name == b"transfer-encoding":
            # Only the final coding decides whether the body is chunked.
            chunked = value.lower().split(b",")[-1].strip() == b"chunked"

    if chunked:
        pieces = []
        while True:
            size_line = self._file.readline()
            # Drop optional chunk extensions (";name=value").
            size_field = size_line.split(b";", 1)[0].strip()
            try:
                chunk_size = int(size_field, 16)
            except ValueError:
                # Covers both end-of-stream (empty line) and garbage.
                chunk_size = -1
            if chunk_size < 0:
                self._content = False
                return
            if chunk_size == 0:
                break
            data = self._file.read(chunk_size)
            if len(data) < chunk_size:
                # Stream ended in the middle of a chunk.
                self._content = False
                return
            pieces.append(data)
            # Consume the line terminator that follows the chunk data.
            self._file.readline()
        # Skip optional trailer fields up to the final blank line.
        while True:
            trailer = self._file.readline()
            if not trailer or trailer in blank_lines:
                break
        self._content = str(b"".join(pieces), "utf-8")
        return

    if content_length > 0:
        self._content = str(self._file.read(content_length), "utf-8")
        return

    # Headers ended without any body framing: the request has no content.
    self._content = False
socket 模拟http请求
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
# coding: utf-8
import socket
from urllib.parse import urlparse


def get_url(url, timeout=None):
    """Fetch *url* with a hand-written HTTP/1.1 GET over a plain TCP socket.

    The request is sent with ``Connection:close`` and the response is read
    until the server closes the connection.  Everything after the first
    blank line of the response is decoded as UTF-8 (undecodable bytes are
    replaced), printed, and returned.

    Args:
        url: Absolute ``http://`` URL.  An explicit port is honoured;
            otherwise port 80 is used.  TLS is not supported.
        timeout: Optional socket timeout in seconds; ``None`` blocks
            indefinitely.

    Returns:
        The response body as text, or ``""`` when the response contains no
        header/body separator.

    Raises:
        ValueError: if *url* has no host or an invalid port.
        OSError: on connection or transfer failure.
    """
    parts = urlparse(url)
    host = parts.hostname
    if not host:
        raise ValueError("URL has no host: {!r}".format(url))
    port = parts.port or 80

    # The request target is the path plus the query string, if any.
    path = parts.path or "/"
    if parts.query:
        path = "{}?{}".format(path, parts.query)

    # A non-default port must be repeated in the Host header.
    host_header = host if port == 80 else "{}:{}".format(host, port)
    request = "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(
        path, host_header
    ).encode("utf-8")

    chunks = []
    # The context manager closes the socket even if send/recv raises.
    with socket.create_connection((host, port), timeout=timeout) as client:
        client.sendall(request)
        while True:
            chunk = client.recv(1024)
            if not chunk:
                break
            chunks.append(chunk)

    # Split only at the FIRST blank line so blank lines inside the body
    # are preserved.
    # NOTE: a chunked response body is returned undecoded (size lines included).
    _head, _sep, body = b"".join(chunks).partition(b"\r\n\r\n")
    html_data = body.decode("utf-8", errors="replace")
    print(html_data)
    return html_data


if __name__ == "__main__":
    get_url("http://www.baidu.com")
以上为个人经验,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/m0_37954775/article/details/100114334