爬虫4.C语言写一个支持http与https下载的客户端
C语言写一个支持http与https下载的客户端
准备几个http网址
技术成就梦想51CTO-中国知名的数字化人才学习平台和技术社区
一、用C语言写一个windows客户端访问使用http协议的网站下载单个文件
1. 大概思路
- 思路为发送Get请求,
- 接收响应,
- 打印响应头 ,
- 将网页源码写入一个html文件中,
2.代码实现
main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/*
 * Downloads one page over plain HTTP and stores the response body in a file.
 *
 * Steps: split the URL into host and path, start Winsock, connect to port 80,
 * send a GET request, then stream the response to output_file.
 * Returns 0 on completion and 1 when the URL cannot be parsed or the
 * connection fails.
 */
int main() {
    const char* url = "http://example.com"; // replace with your URL
    const char* output_file = "example.html"; // replace with the output file name
    int port = 80; // default HTTP port

    // Both buffers start empty so a partial match never leaves garbage behind.
    char host[256] = "", path[256] = "";
    // The host is mandatory; the path after the first '/' is optional.
    if (sscanf(url, "http://%255[^/]/%255[^\n]", host, path) < 1) {
        printf("无法解析URL:%s\n", url);
        return 1;
    }

    // Start Winsock and create the TCP socket.
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // Resolve the host name to an IPv4 address.
    struct sockaddr_in server_addr;
    resolve_hostname(host, port, &server_addr);

    // Connect to the server.
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
        printf("连接服务器失败\n");
        closesocket(sockfd);
        WSACleanup();
        return 1;
    }

    // Send the request; receive_http_response closes sockfd when it is done.
    send_http_request(sockfd, host, path);
    receive_http_response(sockfd, output_file);

    // Release Winsock.
    WSACleanup();
    return 0;
}
http_client_utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
// http_client_utils.h
// Starts Winsock 2.2; prints an error and exits the process on failure.
void initialize_winsock(void);

// Creates an IPv4 TCP socket; prints an error and exits the process on failure.
SOCKET create_socket(void);

// Resolves hostname to an IPv4 address and stores it, with port, in
// *server_addr. Exits the process when resolution fails.
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);

// Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers.
void send_http_request(SOCKET sockfd, const char* host, const char* path);

// Reads the response from sockfd, prints the header, writes the body to
// output_file and closes sockfd.
void receive_http_response(SOCKET sockfd, const char* output_file);
http_client_utils.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/*
 * Starts the Winsock library, requesting version 2.2.
 * Prints an error and terminates the process when start-up fails.
 */
void initialize_winsock() {
    WSADATA winsock_info;
    const int startup_status = WSAStartup(MAKEWORD(2, 2), &winsock_info);

    if (startup_status == 0) {
        return;
    }

    printf("Winsock初始化失败\n");
    exit(1);
}
/*
 * Creates an IPv4 stream (TCP) socket and returns it.
 * On failure prints an error, releases Winsock and terminates the process.
 */
SOCKET create_socket() {
    SOCKET tcp_socket = socket(AF_INET, SOCK_STREAM, 0);

    if (INVALID_SOCKET == tcp_socket) {
        printf("创建套接字失败\n");
        WSACleanup();
        exit(1);
    }

    return tcp_socket;
}
/*
 * Looks up hostname (IPv4, TCP) and copies the first result into *server_addr.
 *
 * hostname    - host name or dotted address to resolve.
 * port        - TCP port, passed to getaddrinfo as a decimal string.
 * server_addr - receives the resolved address and port.
 *
 * On lookup failure prints an error, releases Winsock and exits.
 */
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
    // getaddrinfo wants the service as text.
    char service[6];
    snprintf(service, sizeof(service), "%d", port);

    struct addrinfo criteria;
    memset(&criteria, 0, sizeof(criteria));
    criteria.ai_socktype = SOCK_STREAM;
    criteria.ai_family = AF_INET;

    struct addrinfo* results;
    const int lookup_status = getaddrinfo(hostname, service, &criteria, &results);
    if (lookup_status != 0) {
        printf("无法解析主机名\n");
        WSACleanup();
        exit(1);
    }

    // Only the first entry of the result list is used.
    memcpy(server_addr, results->ai_addr, sizeof(struct sockaddr_in));
    freeaddrinfo(results);
}
/*
 * Sends an HTTP/1.1 GET request for "/<path>" on host over sockfd.
 *
 * The request asks the server to close the connection after the response
 * ("Connection: close"). Nothing is sent if the request does not fit the
 * local buffer. A failed send is reported on stdout.
 */
void send_http_request(SOCKET sockfd, const char* host, const char* path) {
    char request[4096];
    int request_length = snprintf(request, sizeof(request),
        "GET /%s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", path, host);
    // A truncated request would lack the terminating blank line.
    if (request_length < 0 || (size_t)request_length >= sizeof(request)) {
        printf("HTTP请求过长,无法发送\n");
        return;
    }

    // send() may accept only part of the buffer, so loop until all is sent.
    int sent_total = 0;
    while (sent_total < request_length) {
        int sent = send(sockfd, request + sent_total, request_length - sent_total, 0);
        if (sent <= 0) {
            printf("发送HTTP请求失败:%d\n", WSAGetLastError());
            return;
        }
        sent_total += sent;
    }
}
/*
 * Reads the HTTP response from sockfd, prints the header to stdout and writes
 * the body to output_file. Always closes sockfd before returning.
 *
 * The header is accumulated across recv() calls until the blank line that
 * ends it has arrived, so a header split over several packets is handled.
 * The buffer is NUL-terminated before it is searched.
 *
 * NOTE(review): the body is written exactly as received; chunked
 * transfer-encoding is not decoded.
 */
void receive_http_response(SOCKET sockfd, const char* output_file) {
    FILE* file = fopen(output_file, "wb");
    if (!file) {
        printf("打开输出文件失败\n");
        closesocket(sockfd);
        return;
    }

    char buffer[4096];
    int bytes_received = 0;
    int header_bytes = 0;          // bytes of unfinished header held in buffer
    bool header_passed = false;
    bool header_overflow = false;

    for (;;) {
        // While the header is incomplete, append after what is buffered and
        // keep one byte free for the terminator strstr() needs.
        int capacity = (int)sizeof(buffer) - 1 - header_bytes;
        if (capacity <= 0) {
            header_overflow = true;
            break;
        }
        bytes_received = recv(sockfd, buffer + header_bytes, capacity, 0);
        if (bytes_received <= 0) {
            break;
        }

        if (header_passed) {
            fwrite(buffer, 1, (size_t)bytes_received, file);
            continue;
        }

        header_bytes += bytes_received;
        buffer[header_bytes] = '\0';
        char* header_end = strstr(buffer, "\r\n\r\n");
        if (!header_end) {
            // Header not complete yet; keep what we have and read more.
            continue;
        }

        *header_end = '\0';
        printf("HTTP响应头:\n%s\n", buffer);

        // Whatever follows the blank line in this read is body data.
        const char* body = header_end + 4;
        fwrite(body, 1, (size_t)(buffer + header_bytes - body), file);
        header_passed = true;
        header_bytes = 0;
    }

    if (header_overflow) {
        printf("HTTP响应头过长,已中止下载\n");
    }
    else if (bytes_received < 0) {
        printf("接收HTTP响应时出错:%d\n", WSAGetLastError());
    }
    else if (!header_passed) {
        printf("连接在收到完整HTTP响应头之前关闭\n");
    }
    else {
        printf("文件下载成功:%s\n", output_file);
    }

    fclose(file);
    closesocket(sockfd);
}
编译运行crawlerc.exe。
查看example.html下载成功。
二、修改代码,接收所有的请求并下载网页源码与所有文件。
1. 大概思路
- 使用TCP连接,
- 思路为发送Get请求,
- 接收响应,
- 解析响应头并打印,
- 将网页源码写入一个html文件中,
- 解析此网页中所有的链接,
- 关闭TCP连接
- 对着解析后的链接发送TCP 连接请求,
- 对着解析后的链接发送Get请求,
- 接收响应,
- 解析响应头并打印,
- 将数据写入到同名文件中。
- 关闭TCP连接
- 循环发送连接下载文件。
2.代码实现
main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// main.c
/*
 * Downloads a page over plain HTTP, saves it, then downloads every file the
 * page links to via href="..." / src="..." into output_dir.
 *
 * Returns 0 on completion and 1 when the URL cannot be parsed or the initial
 * connection fails.
 */
int main() {
    const char* url = "http://books.toscrape.com/"; // replace with your URL
    const char* output_dir = "downloaded_files"; // folder that receives the downloads
    const char* output_file = "downloaded_files/toscrape.html"; // file for the page itself
    int port = 80; // default HTTP port

    // Both buffers start empty so a partial match never leaves garbage behind.
    char host[256] = "", path[256] = "";
    // The host is mandatory; the path after the first '/' is optional.
    if (sscanf_s(url, "http://%255[^/]/%255[^\n]", host, (unsigned int)sizeof(host), path, (unsigned int)sizeof(path)) < 1) {
        printf("无法解析URL:%s\n", url);
        return 1;
    }

    // Start Winsock and create the TCP socket.
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // Resolve the host name to an IPv4 address.
    struct sockaddr_in server_addr;
    resolve_hostname(host, port, &server_addr);

    // Connect to the server.
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
        printf("连接服务器失败\n");
        closesocket(sockfd);
        WSACleanup();
        return 1;
    }

    // Fetch the page; html_content stays NULL if nothing could be stored.
    char* html_content = NULL;
    send_http_request(sockfd, host, path);
    receive_http_response(sockfd, output_file, &html_content);

    // Download everything the page references.
    if (html_content) {
        download_additional_files(html_content, "http://books.toscrape.com", output_dir);
        free(html_content);
    }

    // Release Winsock.
    WSACleanup();
    return 0;
}
http_client_utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// http_client_utils.h
// Starts Winsock 2.2; prints an error and exits the process on failure.
void initialize_winsock(void);

// Creates an IPv4 TCP socket; prints an error and exits the process on failure.
SOCKET create_socket(void);

// Resolves hostname to an IPv4 address and stores it, with port, in
// *server_addr. Exits the process when resolution fails.
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);

// Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers.
void send_http_request(SOCKET sockfd, const char* host, const char* path);

// Reads the response from sockfd, prints the header, writes the body to
// output_file and closes sockfd. The body is also handed back through
// *html_content as a heap buffer the caller must free().
void receive_http_response(SOCKET sockfd, const char* output_file, char** html_content);

// Returns true when url begins with "http://" or "https://".
bool is_absolute_url(const char* url);

// Downloads every href="..." / src="..." target found in html_content into
// output_dir; relative links are resolved against base_url.
void download_additional_files(const char* html_content, const char* base_url, const char* output_dir);
http_client_utils.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
// http_client_utils.c
/*
 * Initialises Winsock (version 2.2 requested).
 * A failed start-up is fatal: an error is printed and the process exits.
 */
void initialize_winsock() {
    WSADATA startup_data;

    if (WSAStartup(MAKEWORD(2, 2), &startup_data) == 0) {
        return;
    }

    printf("Winsock初始化失败\n");
    exit(1);
}
/*
 * Opens an IPv4 TCP socket and hands it to the caller.
 * If the socket cannot be created, prints an error, releases Winsock and
 * exits the process.
 */
SOCKET create_socket() {
    SOCKET handle = socket(AF_INET, SOCK_STREAM, 0);

    if (handle == INVALID_SOCKET) {
        printf("创建套接字失败\n");
        WSACleanup();
        exit(1);
    }

    return handle;
}
/*
 * Resolves hostname to an IPv4/TCP address and writes it to *server_addr.
 *
 * hostname    - name to look up.
 * port        - TCP port, converted to text for getaddrinfo.
 * server_addr - output; filled from the first lookup result.
 *
 * A failed lookup prints an error, releases Winsock and exits the process.
 */
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
    char port_text[6];
    struct addrinfo query;
    struct addrinfo* answer;

    snprintf(port_text, sizeof(port_text), "%d", port);

    // Restrict the lookup to IPv4 stream sockets.
    memset(&query, 0, sizeof(query));
    query.ai_family = AF_INET;
    query.ai_socktype = SOCK_STREAM;

    if (getaddrinfo(hostname, port_text, &query, &answer) != 0) {
        printf("无法解析主机名\n");
        WSACleanup();
        exit(1);
    }

    memcpy(server_addr, answer->ai_addr, sizeof(struct sockaddr_in));
    freeaddrinfo(answer);
}
/*
 * Sends an HTTP/1.1 GET request for "/<path>" on host over sockfd.
 *
 * The request asks the server to close the connection after the response
 * ("Connection: close"). Nothing is sent if the request does not fit the
 * local buffer. A failed send is reported on stdout.
 */
void send_http_request(SOCKET sockfd, const char* host, const char* path) {
    char request[4096];
    int request_length = snprintf(request, sizeof(request),
        "GET /%s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", path, host);
    // A truncated request would lack the terminating blank line.
    if (request_length < 0 || (size_t)request_length >= sizeof(request)) {
        printf("HTTP请求过长,无法发送\n");
        return;
    }

    // send() may accept only part of the buffer, so loop until all is sent.
    int sent_total = 0;
    while (sent_total < request_length) {
        int sent = send(sockfd, request + sent_total, request_length - sent_total, 0);
        if (sent <= 0) {
            printf("发送HTTP请求失败:%d\n", WSAGetLastError());
            return;
        }
        sent_total += sent;
    }
}
/*
 * Reads the HTTP response from sockfd, prints the header, writes the body to
 * output_file and returns a copy of the body through *html_content.
 *
 * output_file  - destination path; its parent directory is created first.
 * html_content - set to a NUL-terminated heap copy of the body (caller
 *                frees), or to NULL when nothing was stored or on failure.
 *
 * sockfd is always closed before returning. The header is accumulated across
 * recv() calls and NUL-terminated before being searched.
 *
 * NOTE(review): the body is stored exactly as received; chunked
 * transfer-encoding is not decoded.
 */
void receive_http_response(SOCKET sockfd, const char* output_file, char** html_content) {
    // Callers may pass an uninitialized pointer; never leave it indeterminate.
    *html_content = NULL;

    // Create the parent directory of output_file if it names one.
    char output_folder[512];
    strncpy_s(output_folder, sizeof(output_folder), output_file, _TRUNCATE);
    char* last_slash = strrchr(output_folder, '/');
    if (last_slash) {
        *last_slash = '\0';
        _mkdir(output_folder);
    }

    FILE* file = fopen(output_file, "wb");
    if (!file) {
        printf("打开输出文件失败:%s\n", output_file);
        closesocket(sockfd);
        return;
    }

    char buffer[4096];
    int bytes_received = 0;
    int header_bytes = 0;          // bytes of unfinished header held in buffer
    bool header_passed = false;
    bool header_overflow = false;
    size_t content_length = 0;
    char* content = NULL;

    for (;;) {
        // While the header is incomplete, append after what is buffered and
        // keep one byte free for the terminator strstr() needs.
        int capacity = (int)sizeof(buffer) - 1 - header_bytes;
        if (capacity <= 0) {
            header_overflow = true;
            break;
        }
        bytes_received = recv(sockfd, buffer + header_bytes, capacity, 0);
        if (bytes_received <= 0) {
            break;
        }

        const char* chunk = buffer;
        size_t chunk_length = (size_t)bytes_received;

        if (!header_passed) {
            header_bytes += bytes_received;
            buffer[header_bytes] = '\0';
            char* header_end = strstr(buffer, "\r\n\r\n");
            if (!header_end) {
                // Header not complete yet; keep what we have and read more.
                continue;
            }
            *header_end = '\0';
            printf("HTTP响应头:\n%s\n", buffer);

            // Whatever follows the blank line in this read is body data.
            chunk = header_end + 4;
            chunk_length = (size_t)(buffer + header_bytes - chunk);
            header_passed = true;
            header_bytes = 0;
        }

        fwrite(chunk, 1, chunk_length, file);

        // Grow the in-memory copy; keep the old block valid if realloc fails.
        char* grown = realloc(content, content_length + chunk_length + 1);
        if (!grown) {
            printf("内存分配失败\n");
            free(content);
            fclose(file);
            closesocket(sockfd);
            return;
        }
        content = grown;
        memcpy(content + content_length, chunk, chunk_length);
        content_length += chunk_length;
    }

    if (header_overflow) {
        printf("HTTP响应头过长,已中止下载\n");
    }
    else if (bytes_received < 0) {
        printf("接收HTTP响应时出错:%d\n", WSAGetLastError());
    }
    else if (!header_passed) {
        printf("连接在收到完整HTTP响应头之前关闭\n");
    }
    else {
        printf("文件下载成功:%s\n", output_file);
    }

    // content is NULL when no body byte was ever stored.
    if (content) {
        content[content_length] = '\0';
    }
    *html_content = content;

    fclose(file);
    closesocket(sockfd);
}
/*
 * Reports whether url begins with the "http://" or "https://" scheme prefix.
 */
bool is_absolute_url(const char* url) {
    if (strncmp(url, "http://", 7) == 0) {
        return true;
    }
    return strncmp(url, "https://", 8) == 0;
}
/*
 * Downloads every file referenced by an href="..." or src="..." attribute in
 * html_content.
 *
 * html_content - NUL-terminated page source to scan.
 * base_url     - prefix for relative links; one trailing '/' is ignored.
 * output_dir   - directory that receives the files; each link's relative
 *                path is recreated beneath it.
 *
 * Only plain "http://" targets are fetched (port 80). Links that cannot be
 * parsed, or that contain "..", are skipped. Each link uses its own TCP
 * connection.
 */
void download_additional_files(const char* html_content, const char* base_url, const char* output_dir) {
    const char* cursor = html_content;

    // Create the download folder.
    _mkdir(output_dir);

    // Ignore one trailing '/' on the base so joining never doubles it.
    size_t base_url_length = strlen(base_url);
    if (base_url_length > 0 && base_url[base_url_length - 1] == '/') {
        base_url_length--;
    }

    for (;;) {
        // Take whichever attribute appears first after the cursor.
        const char* href = strstr(cursor, "href=\"");
        const char* src = strstr(cursor, "src=\"");
        const char* link_start;
        if (href && (!src || href < src)) {
            link_start = href + 6; // skip href="
        }
        else if (src) {
            link_start = src + 5; // skip src="
        }
        else {
            break;
        }

        const char* link_end = strchr(link_start, '"');
        if (!link_end) {
            break;
        }
        // Advance now so every `continue` below makes progress.
        cursor = link_end;

        size_t link_length = (size_t)(link_end - link_start);
        char* link = malloc(link_length + 1);
        if (!link) {
            printf("内存分配失败\n");
            return;
        }
        memcpy(link, link_start, link_length);
        link[link_length] = '\0';

        // The link comes from a remote page: never let it leave output_dir.
        if (strstr(link, "..")) {
            printf("跳过包含\"..\"的链接: %s\n", link);
            free(link);
            continue;
        }

        // Build the full download URL.
        char full_url[4096];
        if (is_absolute_url(link)) {
            strncpy_s(full_url, sizeof(full_url), link, _TRUNCATE);
        }
        else {
            snprintf(full_url, sizeof(full_url), "%.*s/%s", (int)base_url_length, base_url, link);
        }
        printf("下载链接: %s\n", full_url);

        // Only http:// URLs with a host can be fetched by this client.
        char host[256] = "", path[256] = "";
        if (sscanf_s(full_url, "http://%255[^/]/%255[^\n]", host, (unsigned int)sizeof(host), path, (unsigned int)sizeof(path)) < 1) {
            printf("跳过无法解析的链接: %s\n", full_url);
            free(link);
            continue;
        }

        // Each initialize_winsock() below is paired with one WSACleanup().
        initialize_winsock();
        SOCKET sockfd = create_socket();
        struct sockaddr_in server_addr;
        resolve_hostname(host, 80, &server_addr);
        if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
            printf("连接服务器失败\n");
            closesocket(sockfd);
            WSACleanup();
            free(link);
            continue;
        }
        send_http_request(sockfd, host, path);

        // Local path: output_dir plus the link as written in the page.
        char output_file[512];
        snprintf(output_file, sizeof(output_file), "%s/%s", output_dir, link);

        // Create each intermediate directory of the local path.
        for (char* p = output_file; *p; ++p) {
            if (*p == '/') {
                *p = '\0';
                _mkdir(output_file);
                *p = '/';
            }
        }

        // The body copy is not needed here; free it if one was returned.
        char* downloaded = NULL;
        receive_http_response(sockfd, output_file, &downloaded);
        free(downloaded);

        WSACleanup();
        free(link);
    }
}
编译运行crawlerc.exe。
查看文件夹下载成功。
三、修改代码,接收所有的请求并下载所有jpg格式的文件。
1. 大概思路
- TCP连接
- 发送Get请求,
- 接收响应,
- 解析响应头并打印,
- 将网页源码写入一个html文件中,
- 解析此网页中所有的图片 链接,
- 保存所有链接,
- 关闭TCP连接
- 开启 TCP连接
- 对着解析后的链接发送Get请求,
- 接收响应,
- 解析响应头并打印,
- 将数据写入到同名文件中,
- 关闭TCP连接
- 循环发送TCP与Get请求下载所有文件,
2.代码实现
main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * Downloads a page over plain HTTP, saves it, extracts every JPG link from
 * the page and downloads those images into output_dir.
 *
 * Returns 0 on completion and 1 when the URL cannot be parsed or the initial
 * connection fails.
 */
int main() {
    const char* url = "http://books.toscrape.com/"; // replace with your URL
    const char* output_dir = "downloaded_files"; // folder that receives the downloads
    const char* output_file = "downloaded_files/toscrape.html"; // file for the page itself
    int port = 80; // default HTTP port

    // Both buffers start empty so a partial match never leaves garbage behind.
    char host[256] = "", path[256] = "";
    // The host is mandatory; the path after the first '/' is optional.
    if (sscanf_s(url, "http://%255[^/]/%255[^\n]", host, (unsigned int)sizeof(host), path, (unsigned int)sizeof(path)) < 1) {
        printf("无法解析URL:%s\n", url);
        return 1;
    }

    // Start Winsock and create the TCP socket.
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // Resolve the host name to an IPv4 address.
    struct sockaddr_in server_addr;
    resolve_hostname(host, port, &server_addr);

    // Connect to the server.
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
        printf("连接服务器失败\n");
        closesocket(sockfd);
        WSACleanup();
        return 1;
    }

    // Fetch the page; html_content stays NULL if nothing could be stored.
    char* html_content = NULL;
    send_http_request(sockfd, host, path);
    receive_http_response(sockfd, output_file, &html_content);

    // Collect the JPG links.
    char** links = NULL;
    int num_links = 0;
    if (html_content) {
        extract_jpg_links(html_content, "http://books.toscrape.com", &links, &num_links);
        free(html_content);
    }

    // Download the images.
    if (num_links > 0) {
        download_jpg_files((const char**)links, num_links, output_dir);
    }

    // The array is allocated even when no JPG was found, so always free it.
    for (int i = 0; i < num_links; i++) {
        free(links[i]);
    }
    free(links);

    // Release Winsock.
    WSACleanup();
    return 0;
}
http_client_utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// Starts Winsock 2.2; prints an error and exits the process on failure.
void initialize_winsock(void);

// Creates an IPv4 TCP socket; prints an error and exits the process on failure.
SOCKET create_socket(void);

// Resolves hostname to an IPv4 address and stores it, with port, in
// *server_addr. Exits the process when resolution fails.
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);

// Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers.
void send_http_request(SOCKET sockfd, const char* host, const char* path);

// Reads the response from sockfd, prints the header, writes the body to
// output_file and closes sockfd. The body is also handed back through
// *html_content as a heap buffer the caller must free().
void receive_http_response(SOCKET sockfd, const char* output_file, char** html_content);

// Returns true when url begins with "http://" or "https://".
bool is_absolute_url(const char* url);

// Scans html_content for src="..." values containing ".jpg" or ".jpeg",
// resolves relative ones against base_url and stores heap copies in a heap
// array at *links, with the count in *num_links. The caller frees each
// string and the array.
void extract_jpg_links(const char* html_content, const char* base_url, char** links[], int* num_links);

// Downloads each of the num_links URLs into output_dir, keeping only the
// last path component as the file name.
void download_jpg_files(const char* links[], int num_links, const char* output_dir);
http_client_utils.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/*
 * Starts the Winsock library, requesting version 2.2.
 * Prints an error and terminates the process when start-up fails.
 */
void initialize_winsock() {
    WSADATA winsock_info;
    const int startup_status = WSAStartup(MAKEWORD(2, 2), &winsock_info);

    if (startup_status == 0) {
        return;
    }

    printf("Winsock初始化失败\n");
    exit(1);
}
/*
 * Creates an IPv4 stream (TCP) socket and returns it.
 * On failure prints an error, releases Winsock and terminates the process.
 */
SOCKET create_socket() {
    SOCKET tcp_socket = socket(AF_INET, SOCK_STREAM, 0);

    if (INVALID_SOCKET == tcp_socket) {
        printf("创建套接字失败\n");
        WSACleanup();
        exit(1);
    }

    return tcp_socket;
}
/*
 * Looks up hostname (IPv4, TCP) and copies the first result into *server_addr.
 *
 * hostname    - host name or dotted address to resolve.
 * port        - TCP port, passed to getaddrinfo as a decimal string.
 * server_addr - receives the resolved address and port.
 *
 * On lookup failure prints the host name, releases Winsock and exits.
 */
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
    // getaddrinfo wants the service as text.
    char service[6];
    snprintf(service, sizeof(service), "%d", port);

    struct addrinfo criteria;
    memset(&criteria, 0, sizeof(criteria));
    criteria.ai_socktype = SOCK_STREAM;
    criteria.ai_family = AF_INET;

    struct addrinfo* results;
    const int lookup_status = getaddrinfo(hostname, service, &criteria, &results);
    if (lookup_status != 0) {
        printf("无法解析主机名: %s\n", hostname);
        WSACleanup();
        exit(1);
    }

    // Only the first entry of the result list is used.
    memcpy(server_addr, results->ai_addr, sizeof(struct sockaddr_in));
    freeaddrinfo(results);
}
/*
 * Sends an HTTP/1.1 GET request for "/<path>" on host over sockfd.
 *
 * The request asks the server to close the connection after the response
 * ("Connection: close"). Nothing is sent if the request does not fit the
 * local buffer. A failed send is reported on stdout.
 */
void send_http_request(SOCKET sockfd, const char* host, const char* path) {
    char request[4096];
    int request_length = snprintf(request, sizeof(request),
        "GET /%s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", path, host);
    // A truncated request would lack the terminating blank line.
    if (request_length < 0 || (size_t)request_length >= sizeof(request)) {
        printf("HTTP请求过长,无法发送\n");
        return;
    }

    // send() may accept only part of the buffer, so loop until all is sent.
    int sent_total = 0;
    while (sent_total < request_length) {
        int sent = send(sockfd, request + sent_total, request_length - sent_total, 0);
        if (sent <= 0) {
            printf("发送HTTP请求失败:%d\n", WSAGetLastError());
            return;
        }
        sent_total += sent;
    }
}
/*
 * Reads the HTTP response from sockfd, prints the header, writes the body to
 * output_file and returns a copy of the body through *html_content.
 *
 * output_file  - destination path; its parent directory is created first.
 * html_content - set to a NUL-terminated heap copy of the body (caller
 *                frees), or to NULL when nothing was stored or on failure.
 *
 * sockfd is always closed before returning. The header is accumulated across
 * recv() calls and NUL-terminated before being searched.
 *
 * NOTE(review): the body is stored exactly as received; chunked
 * transfer-encoding is not decoded.
 */
void receive_http_response(SOCKET sockfd, const char* output_file, char** html_content) {
    // Callers may pass an uninitialized pointer; never leave it indeterminate.
    *html_content = NULL;

    // Create the parent directory of output_file if it names one.
    char output_folder[512];
    strncpy_s(output_folder, sizeof(output_folder), output_file, _TRUNCATE);
    char* last_slash = strrchr(output_folder, '/');
    if (last_slash) {
        *last_slash = '\0';
        _mkdir(output_folder);
    }

    FILE* file = fopen(output_file, "wb");
    if (!file) {
        printf("打开输出文件失败:%s\n", output_file);
        closesocket(sockfd);
        return;
    }

    char buffer[4096];
    int bytes_received = 0;
    int header_bytes = 0;          // bytes of unfinished header held in buffer
    bool header_passed = false;
    bool header_overflow = false;
    size_t content_length = 0;
    char* content = NULL;

    for (;;) {
        // While the header is incomplete, append after what is buffered and
        // keep one byte free for the terminator strstr() needs.
        int capacity = (int)sizeof(buffer) - 1 - header_bytes;
        if (capacity <= 0) {
            header_overflow = true;
            break;
        }
        bytes_received = recv(sockfd, buffer + header_bytes, capacity, 0);
        if (bytes_received <= 0) {
            break;
        }

        const char* chunk = buffer;
        size_t chunk_length = (size_t)bytes_received;

        if (!header_passed) {
            header_bytes += bytes_received;
            buffer[header_bytes] = '\0';
            char* header_end = strstr(buffer, "\r\n\r\n");
            if (!header_end) {
                // Header not complete yet; keep what we have and read more.
                continue;
            }
            *header_end = '\0';
            printf("HTTP响应头:\n%s\n", buffer);

            // Whatever follows the blank line in this read is body data.
            chunk = header_end + 4;
            chunk_length = (size_t)(buffer + header_bytes - chunk);
            header_passed = true;
            header_bytes = 0;
        }

        fwrite(chunk, 1, chunk_length, file);

        // Grow the in-memory copy; keep the old block valid if realloc fails.
        char* grown = realloc(content, content_length + chunk_length + 1);
        if (!grown) {
            printf("内存分配失败\n");
            free(content);
            fclose(file);
            closesocket(sockfd);
            return;
        }
        content = grown;
        memcpy(content + content_length, chunk, chunk_length);
        content_length += chunk_length;
    }

    if (header_overflow) {
        printf("HTTP响应头过长,已中止下载\n");
    }
    else if (bytes_received < 0) {
        printf("接收HTTP响应时出错:%d\n", WSAGetLastError());
    }
    else if (!header_passed) {
        printf("连接在收到完整HTTP响应头之前关闭\n");
    }
    else {
        printf("文件下载成功:%s\n", output_file);
    }

    // content is NULL when no body byte was ever stored.
    if (content) {
        content[content_length] = '\0';
    }
    *html_content = content;

    fclose(file);
    closesocket(sockfd);
}
/*
 * Returns true when url starts with one of the recognised scheme prefixes
 * ("http://" or "https://"), false otherwise.
 */
bool is_absolute_url(const char* url) {
    static const char* const scheme_prefixes[] = { "http://", "https://" };
    const size_t prefix_count = sizeof(scheme_prefixes) / sizeof(scheme_prefixes[0]);

    for (size_t i = 0; i < prefix_count; i++) {
        const char* prefix = scheme_prefixes[i];
        if (strncmp(url, prefix, strlen(prefix)) == 0) {
            return true;
        }
    }
    return false;
}
/*
 * Collects the JPG image URLs referenced by src="..." attributes.
 *
 * html_content - NUL-terminated page source to scan.
 * base_url     - prefix for relative links; one trailing '/' is ignored.
 * links        - receives a heap array of heap-allocated URL strings, or
 *                NULL if the array itself cannot be allocated.
 * num_links    - receives the number of valid entries in *links.
 *
 * A link qualifies when its text contains ".jpg" or ".jpeg". If memory runs
 * out part-way through, the entries gathered so far stay valid and are
 * reported through *num_links. The caller frees each string and the array.
 */
void extract_jpg_links(const char* html_content, const char* base_url, char** links[], int* num_links) {
    const char* cursor = html_content;
    int links_capacity = 10;

    // Allocate the initial link array.
    *num_links = 0;
    *links = malloc((size_t)links_capacity * sizeof(**links));
    if (!*links) {
        printf("内存分配失败\n");
        return;
    }

    // Ignore one trailing '/' on the base so joining never doubles it.
    size_t base_url_length = strlen(base_url);
    if (base_url_length > 0 && base_url[base_url_length - 1] == '/') {
        base_url_length--;
    }

    const char* link_start;
    while ((link_start = strstr(cursor, "src=\""))) {
        link_start += 5; // skip src="
        const char* link_end = strchr(link_start, '"');
        if (!link_end) {
            break;
        }
        cursor = link_end;

        size_t link_length = (size_t)(link_end - link_start);
        char* link = malloc(link_length + 1);
        if (!link) {
            printf("内存分配失败\n");
            return;
        }
        memcpy(link, link_start, link_length);
        link[link_length] = '\0';

        // Keep JPG links only.
        if (strstr(link, ".jpg") || strstr(link, ".jpeg")) {
            // Build the full download URL.
            char full_url[4096];
            if (is_absolute_url(link)) {
                strncpy_s(full_url, sizeof(full_url), link, _TRUNCATE);
            }
            else {
                snprintf(full_url, sizeof(full_url), "%.*s/%s", (int)base_url_length, base_url, link);
            }

            // Grow the array; the old block stays valid if realloc fails.
            if (*num_links >= links_capacity) {
                int new_capacity = links_capacity * 2;
                char** grown = realloc(*links, (size_t)new_capacity * sizeof(**links));
                if (!grown) {
                    printf("内存分配失败\n");
                    free(link);
                    return;
                }
                *links = grown;
                links_capacity = new_capacity;
            }

            char* stored_url = _strdup(full_url);
            if (!stored_url) {
                printf("内存分配失败\n");
                free(link);
                return;
            }
            (*links)[*num_links] = stored_url;
            (*num_links)++;
        }
        free(link);
    }
}
/*
 * Downloads every URL in links[] into output_dir.
 *
 * links      - array of num_links "http://host/path" URLs.
 * num_links  - number of entries in links.
 * output_dir - destination folder; created if missing. Only the last path
 *              component of each URL is used as the file name.
 *
 * URLs that are not plain "http://" or that have no file name are skipped.
 * Each download uses its own TCP connection on port 80.
 */
void download_jpg_files(const char* links[], int num_links, const char* output_dir) {
    // Create the download folder.
    _mkdir(output_dir);

    for (int i = 0; i < num_links; i++) {
        printf("下载链接: %s\n", links[i]);

        // Split the URL; only http:// URLs with a host can be fetched.
        char host[256] = "", path[256] = "";
        if (sscanf_s(links[i], "http://%255[^/]/%255[^\n]", host, (unsigned int)sizeof(host), path, (unsigned int)sizeof(path)) < 1) {
            printf("跳过无法解析的链接: %s\n", links[i]);
            continue;
        }

        // File name = text after the last '/', or the whole path if none.
        const char* last_slash = strrchr(path, '/');
        const char* file_name = last_slash ? last_slash + 1 : path;
        if (*file_name == '\0') {
            printf("跳过没有文件名的链接: %s\n", links[i]);
            continue;
        }

        // Each initialize_winsock() below is paired with one WSACleanup().
        initialize_winsock();
        SOCKET sockfd = create_socket();
        struct sockaddr_in server_addr;
        resolve_hostname(host, 80, &server_addr);
        if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
            printf("连接服务器失败\n");
            closesocket(sockfd);
            WSACleanup();
            continue;
        }
        send_http_request(sockfd, host, path);

        // Save directly under output_dir; the original path is not kept.
        char output_file[512];
        snprintf(output_file, sizeof(output_file), "%s/%s", output_dir, file_name);

        // receive_http_response opens output_file itself and closes sockfd,
        // so the file must not also be opened here.
        char* downloaded = NULL;
        receive_http_response(sockfd, output_file, &downloaded);
        free(downloaded);

        WSACleanup();
    }
}
编译运行crawlerc.exe。
查看文件夹下载成功。
四、修改代码,支持https协议,并使用User-Agent请求头假装正常浏览器访问并下载文件
1. 大概思路
- 思路为发送Get请求,
- 接收响应,
- 打印响应头 ,
- 将接受到的数据写入文件中,
2.代码实现
编写代码访问https://s5.51cto.com/oss/202408/30/a7a3092691d8f3fdb3322730c0fba80fd82f85.png并下载该png图片文件。
main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/*
 * Downloads one file over HTTPS and stores the response body in output_file.
 *
 * Steps: split the URL, connect a TCP socket to port 443, run the TLS
 * handshake, send a GET request, then stream the response to disk.
 * Returns 0 on completion and 1 when the URL cannot be parsed or the TCP
 * connection fails.
 *
 * NOTE(review): no server-certificate verification is configured anywhere in
 * the visible code.
 */
int main() {
    const char* url = "https://s5.51cto.com/oss/202408/30/a7a3092691d8f3fdb3322730c0fba80fd82f85.png"; // replace with your URL
    const char* output_file = "f85.png"; // replace with the output file name
    int port = 443; // default HTTPS port

    // Both buffers start empty so a partial match never leaves garbage behind.
    char host[256] = "", path[256] = "";
    // The host is mandatory; the path after the first '/' is optional.
    if (sscanf(url, "https://%255[^/]/%255[^\n]", host, path) < 1) {
        printf("无法解析URL:%s\n", url);
        return 1;
    }

    // Start Winsock and create the TCP socket.
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // Resolve the host name to an IPv4 address.
    struct sockaddr_in server_addr;
    resolve_hostname(host, port, &server_addr);

    // Connect to the server.
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
        printf("连接服务器失败\n");
        closesocket(sockfd);
        WSACleanup();
        return 1;
    }

    // Create the SSL context and run the TLS handshake on the socket.
    SSL_CTX* ctx = create_ssl_context();
    SSL* ssl = connect_ssl(ctx, sockfd);

    // Send the request; receive_http_response frees ssl when it is done.
    send_http_request(ssl, host, path);
    receive_http_response(ssl, output_file);

    // Freeing the SSL object does not close a socket attached with
    // SSL_set_fd, so close it here.
    closesocket(sockfd);

    // Release SSL resources.
    SSL_CTX_free(ctx);
    ERR_free_strings();

    // Release Winsock.
    WSACleanup();
    return 0;
}
http_client_utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// Starts Winsock 2.2; prints an error and exits the process on failure.
void initialize_winsock(void);

// Creates an IPv4 TCP socket; prints an error and exits the process on failure.
SOCKET create_socket(void);

// Resolves hostname to an IPv4 address and stores it, with port, in
// *server_addr. Exits the process when resolution fails.
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);

// Creates a TLS client context; exits the process on failure.
SSL_CTX* create_ssl_context(void);

// Wraps the connected socket in a new SSL object and performs the handshake;
// exits the process on failure.
SSL* connect_ssl(SSL_CTX* ctx, SOCKET sockfd);

// Sends "GET /<path> HTTP/1.1" with Host, "Connection: close" and a
// browser-like User-Agent header over the TLS connection.
void send_http_request(SSL* ssl, const char* host, const char* path);

// Reads the response from ssl, prints the header, writes the body to
// output_file and frees ssl.
void receive_http_response(SSL* ssl, const char* output_file);
http_client_utils.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/*
 * Initialises Winsock (version 2.2 requested).
 * A failed start-up is fatal: an error is printed and the process exits.
 */
void initialize_winsock() {
    WSADATA startup_data;

    if (WSAStartup(MAKEWORD(2, 2), &startup_data) == 0) {
        return;
    }

    printf("Winsock初始化失败\n");
    exit(1);
}
/*
 * Opens an IPv4 TCP socket and hands it to the caller.
 * If the socket cannot be created, prints an error, releases Winsock and
 * exits the process.
 */
SOCKET create_socket() {
    SOCKET handle = socket(AF_INET, SOCK_STREAM, 0);

    if (handle == INVALID_SOCKET) {
        printf("创建套接字失败\n");
        WSACleanup();
        exit(1);
    }

    return handle;
}
/*
 * Resolves hostname to an IPv4/TCP address and writes it to *server_addr.
 *
 * hostname    - name to look up.
 * port        - TCP port, converted to text for getaddrinfo.
 * server_addr - output; filled from the first lookup result.
 *
 * A failed lookup prints an error, releases Winsock and exits the process.
 */
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
    char port_text[6];
    struct addrinfo query;
    struct addrinfo* answer;

    snprintf(port_text, sizeof(port_text), "%d", port);

    // Restrict the lookup to IPv4 stream sockets.
    memset(&query, 0, sizeof(query));
    query.ai_family = AF_INET;
    query.ai_socktype = SOCK_STREAM;

    if (getaddrinfo(hostname, port_text, &query, &answer) != 0) {
        printf("无法解析主机名\n");
        WSACleanup();
        exit(1);
    }

    memcpy(server_addr, answer->ai_addr, sizeof(struct sockaddr_in));
    freeaddrinfo(answer);
}
/*
 * Sets up OpenSSL and returns a new TLS client context.
 * If the context cannot be created, prints the OpenSSL error queue to stderr
 * and exits the process. The caller releases the context with SSL_CTX_free.
 */
SSL_CTX* create_ssl_context() {
    // Library setup must happen before the context is created.
    SSL_load_error_strings();
    OpenSSL_add_ssl_algorithms();

    SSL_CTX* client_ctx = SSL_CTX_new(TLS_client_method());
    if (client_ctx == NULL) {
        printf("无法创建SSL上下文\n");
        ERR_print_errors_fp(stderr);
        exit(1);
    }

    return client_ctx;
}
/*
 * Creates an SSL object on ctx, attaches the connected socket and performs
 * the TLS client handshake.
 *
 * Returns the connected SSL object. Any failure prints a message plus the
 * OpenSSL error queue and exits the process.
 *
 * NOTE(review): no server name (SNI) is set and the peer certificate is not
 * verified; the signature carries no host name to do either with.
 */
SSL* connect_ssl(SSL_CTX* ctx, SOCKET sockfd) {
    SSL* ssl = SSL_new(ctx);
    if (!ssl) {
        printf("无法创建SSL结构\n");
        ERR_print_errors_fp(stderr);
        exit(1);
    }

    // SSL_set_fd takes an int; the cast makes the narrowing explicit.
    // It returns 1 on success.
    if (SSL_set_fd(ssl, (int)sockfd) != 1) {
        printf("无法将套接字绑定到SSL\n");
        ERR_print_errors_fp(stderr);
        exit(1);
    }

    if (SSL_connect(ssl) <= 0) {
        printf("无法建立SSL连接\n");
        ERR_print_errors_fp(stderr);
        exit(1);
    }
    return ssl;
}
/*
 * Sends an HTTP/1.1 GET request for "/<path>" on host over the TLS
 * connection, with "Connection: close" and a browser-like User-Agent.
 *
 * Nothing is sent if the request does not fit the local buffer. A failed
 * write is reported on stdout and the OpenSSL error queue goes to stderr.
 */
void send_http_request(SSL* ssl, const char* host, const char* path) {
    char request[4096];
    int request_length = snprintf(request, sizeof(request), "GET /%s HTTP/1.1\r\n"
        "Host: %s\r\n"
        "Connection: close\r\n"
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3\r\n\r\n",
        path, host);
    // A truncated request would lack the terminating blank line.
    if (request_length < 0 || (size_t)request_length >= sizeof(request)) {
        printf("HTTP请求过长,无法发送\n");
        return;
    }

    if (SSL_write(ssl, request, request_length) <= 0) {
        printf("发送HTTP请求失败\n");
        ERR_print_errors_fp(stderr);
    }
}
/*
 * Reads the HTTP response from the TLS connection, prints the header to
 * stdout and writes the body to output_file. Always frees ssl before
 * returning; the underlying socket is left for the caller to close.
 *
 * The header is accumulated across SSL_read() calls until the blank line
 * that ends it has arrived. The buffer is NUL-terminated before it is
 * searched.
 *
 * NOTE(review): the body is written exactly as received; chunked
 * transfer-encoding is not decoded.
 */
void receive_http_response(SSL* ssl, const char* output_file) {
    FILE* file = fopen(output_file, "wb");
    if (!file) {
        printf("打开输出文件失败\n");
        SSL_free(ssl);
        return;
    }

    char buffer[4096];
    int bytes_received = 0;
    int header_bytes = 0;          // bytes of unfinished header held in buffer
    bool header_passed = false;
    bool header_overflow = false;

    for (;;) {
        // While the header is incomplete, append after what is buffered and
        // keep one byte free for the terminator strstr() needs.
        int capacity = (int)sizeof(buffer) - 1 - header_bytes;
        if (capacity <= 0) {
            header_overflow = true;
            break;
        }
        bytes_received = SSL_read(ssl, buffer + header_bytes, capacity);
        if (bytes_received <= 0) {
            break;
        }

        if (header_passed) {
            fwrite(buffer, 1, (size_t)bytes_received, file);
            continue;
        }

        header_bytes += bytes_received;
        buffer[header_bytes] = '\0';
        char* header_end = strstr(buffer, "\r\n\r\n");
        if (!header_end) {
            // Header not complete yet; keep what we have and read more.
            continue;
        }

        *header_end = '\0';
        printf("HTTP响应头:\n%s\n", buffer);

        // Whatever follows the blank line in this read is body data.
        const char* body = header_end + 4;
        fwrite(body, 1, (size_t)(buffer + header_bytes - body), file);
        header_passed = true;
        header_bytes = 0;
    }

    if (header_overflow) {
        printf("HTTP响应头过长,已中止下载\n");
    }
    else if (bytes_received < 0) {
        printf("接收HTTP响应时出错\n");
        ERR_print_errors_fp(stderr);
    }
    else if (!header_passed) {
        printf("连接在收到完整HTTP响应头之前关闭\n");
    }
    else {
        printf("文件下载成功:%s\n", output_file);
    }

    fclose(file);
    SSL_free(ssl);
}
编译运行crawlerc.exe。
查看文件下载成功。
五、修改代码,使其同时支持http与https协议解析文件名
代码实现
main.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/*
 * Downloads the URL given as the first command-line argument over HTTP or
 * HTTPS and saves it under the URL's last path component.
 *
 * The scheme selects the transport: "https://" uses TLS on port 443,
 * "http://" uses plain TCP on port 80. A URL with no file name is saved as
 * "index.html".
 *
 * Returns 0 on completion and 1 when the argument is missing, the URL cannot
 * be parsed or the TCP connection fails.
 */
int main(int argc,char**argv) {
    // Without a URL there is nothing to do; stop before it is dereferenced.
    if (argc < 2 || argv[1] == NULL) {
        printf("help: get domain");
        return 1;
    }
    const char* url = argv[1];
    printf("adress:\n\t%s\n", url);

    bool use_ssl = strstr(url, "https://") == url; // pick HTTP or HTTPS from the URL
    int port = use_ssl ? 443 : 80; // port follows the scheme

    // Both buffers start empty so a partial match never leaves garbage behind.
    char host[256] = "", path[256] = "";
    // Fails for any scheme other than the expected one, or a missing host.
    if (sscanf(url, use_ssl ? "https://%255[^/]/%255[^\n]" : "http://%255[^/]/%255[^\n]", host, path) < 1) {
        printf("无法解析URL:%s\n", url);
        return 1;
    }

    // File name = text after the last '/', or the whole path when it has no
    // '/'; an empty name falls back to a default.
    const char* last_slash = strrchr(path, '/');
    const char* filename = last_slash ? last_slash + 1 : path;
    if (*filename == '\0') {
        filename = "index.html";
    }
    printf("下载文件名:%s\n", filename);

    // Start Winsock and create the TCP socket.
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // Resolve the host name to an IPv4 address.
    struct sockaddr_in server_addr;
    resolve_hostname(host, port, &server_addr);

    // Connect to the server.
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
        printf("连接服务器失败\n");
        closesocket(sockfd);
        WSACleanup();
        return 1;
    }

    SSL_CTX* ctx = NULL;
    SSL* ssl = NULL;
    if (use_ssl) {
        // Create the SSL context and run the TLS handshake on the socket.
        ctx = create_ssl_context();
        ssl = connect_ssl(ctx, sockfd);
    }

    // Send the request and stream the response to disk.
    send_http_request(ssl, sockfd, host, path, use_ssl);
    receive_http_response(ssl, sockfd, filename, use_ssl);

    // Release SSL resources.
    if (use_ssl) {
        SSL_CTX_free(ctx);
        ERR_free_strings();
    }

    // Release Winsock.
    WSACleanup();
    return 0;
}
http_client_utils.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
// Starts Winsock 2.2; prints an error and exits the process on failure.
void initialize_winsock(void);

// Creates an IPv4 TCP socket; prints an error and exits the process on failure.
SOCKET create_socket(void);

// Resolves hostname to an IPv4 address and stores it, with port, in
// *server_addr. Exits the process when resolution fails.
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);

// Creates a TLS client context; exits the process on failure.
SSL_CTX* create_ssl_context(void);

// Wraps the connected socket in a new SSL object and performs the handshake;
// exits the process on failure.
SSL* connect_ssl(SSL_CTX* ctx, SOCKET sockfd);

// Sends a GET request for "/<path>" through ssl when use_ssl is true,
// otherwise directly over sockfd.
void send_http_request(SSL* ssl, SOCKET sockfd, const char* host, const char* path, bool use_ssl);

// Reads the response (through ssl when use_ssl is true, otherwise from
// sockfd), prints the header and writes the body to output_file.
void receive_http_response(SSL* ssl, SOCKET sockfd, const char* output_file, bool use_ssl);
http_client_utils.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/*
 * Starts the Winsock library, requesting version 2.2.
 * Prints an error and terminates the process when start-up fails.
 */
void initialize_winsock() {
    WSADATA winsock_info;
    const int startup_status = WSAStartup(MAKEWORD(2, 2), &winsock_info);

    if (startup_status == 0) {
        return;
    }

    printf("Winsock初始化失败\n");
    exit(1);
}
/*
 * Creates an IPv4 stream (TCP) socket and returns it.
 * On failure prints an error, releases Winsock and terminates the process.
 */
SOCKET create_socket() {
    SOCKET tcp_socket = socket(AF_INET, SOCK_STREAM, 0);

    if (INVALID_SOCKET == tcp_socket) {
        printf("创建套接字失败\n");
        WSACleanup();
        exit(1);
    }

    return tcp_socket;
}
/*
 * Looks up hostname (IPv4, TCP) and copies the first result into *server_addr.
 *
 * hostname    - host name or dotted address to resolve.
 * port        - TCP port, passed to getaddrinfo as a decimal string.
 * server_addr - receives the resolved address and port.
 *
 * On lookup failure prints an error, releases Winsock and exits.
 */
void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
    // getaddrinfo wants the service as text.
    char service[6];
    snprintf(service, sizeof(service), "%d", port);

    struct addrinfo criteria;
    memset(&criteria, 0, sizeof(criteria));
    criteria.ai_socktype = SOCK_STREAM;
    criteria.ai_family = AF_INET;

    struct addrinfo* results;
    const int lookup_status = getaddrinfo(hostname, service, &criteria, &results);
    if (lookup_status != 0) {
        printf("无法解析主机名\n");
        WSACleanup();
        exit(1);
    }

    // Only the first entry of the result list is used.
    memcpy(server_addr, results->ai_addr, sizeof(struct sockaddr_in));
    freeaddrinfo(results);
}
/*
 * Prepares OpenSSL and builds a TLS client context.
 * Failure to create the context is fatal: a message and the OpenSSL error
 * queue are printed and the process exits. The caller owns the returned
 * context and releases it with SSL_CTX_free.
 */
SSL_CTX* create_ssl_context() {
    SSL_load_error_strings();
    OpenSSL_add_ssl_algorithms();

    SSL_CTX* tls_ctx = SSL_CTX_new(TLS_client_method());
    if (tls_ctx == NULL) {
        printf("无法创建SSL上下文\n");
        ERR_print_errors_fp(stderr);
        exit(1);
    }

    return tls_ctx;
}
/*
 * Creates an SSL object on ctx, attaches the connected socket and performs
 * the TLS client handshake.
 *
 * Returns the connected SSL object. Any failure prints a message plus the
 * OpenSSL error queue and exits the process.
 *
 * NOTE(review): no server name (SNI) is set and the peer certificate is not
 * verified; the signature carries no host name to do either with.
 */
SSL* connect_ssl(SSL_CTX* ctx, SOCKET sockfd) {
    SSL* ssl = SSL_new(ctx);
    if (!ssl) {
        printf("无法创建SSL结构\n");
        ERR_print_errors_fp(stderr);
        exit(1);
    }

    // SSL_set_fd takes an int; the cast makes the narrowing explicit.
    // It returns 1 on success.
    if (SSL_set_fd(ssl, (int)sockfd) != 1) {
        printf("无法将套接字绑定到SSL\n");
        ERR_print_errors_fp(stderr);
        exit(1);
    }

    if (SSL_connect(ssl) <= 0) {
        printf("无法建立SSL连接\n");
        ERR_print_errors_fp(stderr);
        exit(1);
    }

    // Debug output.
    printf("SSL握手成功\n");
    return ssl;
}
/*
 * Sends an HTTP/1.1 GET request for "/<path>" on host, with
 * "Connection: close" and a browser-like User-Agent.
 *
 * ssl     - TLS connection, used when use_ssl is true.
 * sockfd  - connected socket, used when use_ssl is false.
 * use_ssl - selects which of the two transports carries the request.
 *
 * Nothing is sent if the request does not fit the local buffer. Send
 * failures are reported on stdout.
 */
void send_http_request(SSL* ssl, SOCKET sockfd, const char* host, const char* path, bool use_ssl) {
    char request[4096];
    // Header lines must be exactly "Name: value": no spaces inside the
    // field name and no stray flag in the %s conversion.
    int request_length = snprintf(request, sizeof(request), "GET /%s HTTP/1.1\r\n"
        "Host: %s\r\n"
        "Connection: close\r\n"
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3\r\n\r\n"
        , path, host);
    // A truncated request would lack the terminating blank line.
    if (request_length < 0 || (size_t)request_length >= sizeof(request)) {
        printf("HTTP请求过长,无法发送\n");
        return;
    }

    if (use_ssl) {
        if (SSL_write(ssl, request, request_length) <= 0) {
            printf("发送HTTP请求失败\n");
            ERR_print_errors_fp(stderr);
        }
        return;
    }

    // send() may accept only part of the buffer, so loop until all is sent.
    int sent_total = 0;
    while (sent_total < request_length) {
        int sent = send(sockfd, request + sent_total, request_length - sent_total, 0);
        if (sent <= 0) {
            printf("发送HTTP请求失败:%d\n", WSAGetLastError());
            return;
        }
        sent_total += sent;
    }
}
/*
 * Reads the HTTP response, prints the header to stdout, writes the body to
 * output_file and reports whether the byte count matches Content-Length.
 *
 * ssl     - TLS connection, read when use_ssl is true; freed before return.
 * sockfd  - connected socket, read directly when use_ssl is false; closed
 *           before return in both modes, because freeing ssl does not close
 *           a socket attached with SSL_set_fd.
 * use_ssl - selects the transport.
 *
 * The header is accumulated across reads until the blank line that ends it
 * has arrived, and the buffer is NUL-terminated before it is searched.
 *
 * NOTE(review): "Content-Length: " is matched case-sensitively and chunked
 * transfer-encoding is not decoded.
 */
void receive_http_response(SSL* ssl, SOCKET sockfd, const char* output_file, bool use_ssl) {
    FILE* file = fopen(output_file, "wb");
    if (!file) {
        printf("打开输出文件失败\n");
        if (use_ssl) {
            SSL_free(ssl);
        }
        closesocket(sockfd);
        return;
    }

    char buffer[4096];
    int bytes_received = 0;
    int header_bytes = 0;             // bytes of unfinished header held in buffer
    bool header_passed = false;
    bool header_overflow = false;
    bool have_content_length = false; // a missing header is not a mismatch
    size_t total_bytes_received = 0;
    size_t content_length = 0;

    for (;;) {
        // While the header is incomplete, append after what is buffered and
        // keep one byte free for the terminator strstr() needs.
        int capacity = (int)sizeof(buffer) - 1 - header_bytes;
        if (capacity <= 0) {
            header_overflow = true;
            break;
        }
        char* read_target = buffer + header_bytes;
        bytes_received = use_ssl ? SSL_read(ssl, read_target, capacity)
                                 : recv(sockfd, read_target, capacity, 0);
        if (bytes_received <= 0) {
            break;
        }

        const char* chunk = buffer;
        size_t chunk_length = (size_t)bytes_received;

        if (!header_passed) {
            header_bytes += bytes_received;
            buffer[header_bytes] = '\0';
            char* header_end = strstr(buffer, "\r\n\r\n");
            if (!header_end) {
                // Header not complete yet; keep what we have and read more.
                continue;
            }
            *header_end = '\0';
            printf("HTTP响应头:\n%s\n", buffer);

            // Parse Content-Length from the now-terminated header text.
            char* content_length_str = strstr(buffer, "Content-Length: ");
            if (content_length_str) {
                content_length_str += strlen("Content-Length: ");
                content_length = strtoul(content_length_str, NULL, 10);
                have_content_length = true;
            }

            // Whatever follows the blank line in this read is body data.
            chunk = header_end + 4;
            chunk_length = (size_t)(buffer + header_bytes - chunk);
            header_passed = true;
            header_bytes = 0;
        }

        fwrite(chunk, 1, chunk_length, file);
        total_bytes_received += chunk_length;
    }

    if (header_overflow) {
        printf("HTTP响应头过长,已中止下载\n");
    }
    else if (bytes_received < 0) {
        printf("接收HTTP响应时出错\n");
        if (use_ssl) {
            ERR_print_errors_fp(stderr);
        }
        else {
            printf("错误代码:%d\n", WSAGetLastError());
        }
    }

    printf("总共接收到字节数:%zu\n", total_bytes_received);
    if (header_passed && !have_content_length && bytes_received == 0) {
        // No declared size to compare against; the stream ended normally.
        printf("文件下载完成(响应未提供Content-Length,无法校验大小):%s\n", output_file);
    }
    else if (have_content_length && total_bytes_received == content_length) {
        printf("文件下载成功:%s\n", output_file);
    }
    else {
        printf("文件下载不完整:%s\n", output_file);
        printf("预期字节数:%zu, 实际接收字节数:%zu\n", content_length, total_bytes_received);
    }

    fclose(file);
    if (use_ssl) {
        SSL_free(ssl);
    }
    closesocket(sockfd);
}
编译运行crawlerc.exe。
查看文件下载成功。
All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.
