C语言写一个支持http与https下载的客户端

准备几个http网址

http://example.com

http://books.toscrape.com/

技术成就梦想51CTO-中国知名的数字化人才学习平台和技术社区

一、用C语言写一个windows客户端访问使用http协议的网站下载单个文件

1. 大概思路

  1. 思路为发送Get请求,
  2. 接收响应,
  3. 打印响应头 ,
  4. 将网页源码写入一个html文件中,

2.代码实现

  1. 编写代码访问http://example.com并下载html文件。

  2. main.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    #define _CRT_SECURE_NO_WARNINGS
    #include <stdio.h>
    #include <winsock2.h>
    #include "http_client_utils.h"
    #pragma comment(lib, "ws2_32.lib")

    int main() {
    const char* url = "http://example.com"; // 替换为您的URL
    const char* output_file = "example.html"; // 替换为输出文件名
    int port = 80; // 默认HTTP端口

    char host[256], path[256] = "";
    sscanf(url, "http://%255[^/]/%255[^\n]", host, path);

    // 初始化Winsock和创建套接字
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // 解析主机名并获取服务器地址
    struct sockaddr_in server_addr;
    resolve_hostname(host, port, &server_addr);

    // 连接服务器
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
    printf("连接服务器失败\n");
    closesocket(sockfd);
    WSACleanup();
    return 1;
    }

    // 发送HTTP请求并接收响应
    send_http_request(sockfd, host, path);
    receive_http_response(sockfd, output_file);

    // 清理Winsock
    WSACleanup();
    return 0;
    }

  3. http_client_utils.h

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    // http_client_utils.h
    #ifndef HTTP_CLIENT_UTILS_H
    #define HTTP_CLIENT_UTILS_H

    #include <winsock2.h>

    /* Starts Winsock 2.2; prints an error and exits the process on failure. */
    void initialize_winsock();
    /* Creates an AF_INET stream socket; on failure calls WSACleanup() and exits. */
    SOCKET create_socket();
    /* Resolves hostname/port with getaddrinfo() (IPv4 only) and copies the first
     * result into *server_addr; on failure calls WSACleanup() and exits. */
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);
    /* Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers on sockfd. */
    void send_http_request(SOCKET sockfd, const char* host, const char* path);
    /* Reads the response from sockfd, prints the response header, writes the body
     * to output_file and closes sockfd before returning. */
    void receive_http_response(SOCKET sockfd, const char* output_file);

    #endif // HTTP_CLIENT_UTILS_H
  4. http_client_utils.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    #include "http_client_utils.h"
    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>
    #include <ws2tcpip.h>

    // Starts Winsock (requesting version 2.2) for this process.
    // On failure a message is printed and the process exits with status 1.
    void initialize_winsock() {
        WSADATA startup_info;
        const int startup_status = WSAStartup(MAKEWORD(2, 2), &startup_info);
        if (startup_status == 0) {
            return;
        }
        printf("Winsock初始化失败\n");
        exit(1);
    }

    // Creates an AF_INET stream socket and returns it.
    // If socket() fails, prints a message, calls WSACleanup() and exits with status 1.
    SOCKET create_socket() {
        const SOCKET stream_socket = socket(AF_INET, SOCK_STREAM, 0);
        if (stream_socket == INVALID_SOCKET) {
            printf("创建套接字失败\n");
            WSACleanup();
            exit(1);
        }
        return stream_socket;
    }

    // Looks up hostname (IPv4, stream sockets only) and copies the first address
    // returned by getaddrinfo() into *server_addr.
    // If the lookup fails, prints a message, calls WSACleanup() and exits with status 1.
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
        // getaddrinfo() takes the service as text, so render the port number first.
        char service[6];
        snprintf(service, sizeof(service), "%d", port);

        struct addrinfo query;
        memset(&query, 0, sizeof(query));
        query.ai_socktype = SOCK_STREAM;
        query.ai_family = AF_INET;

        struct addrinfo* found;
        const int lookup_status = getaddrinfo(hostname, service, &query, &found);
        if (lookup_status != 0) {
            printf("无法解析主机名\n");
            WSACleanup();
            exit(1);
        }

        // Only the head of the result list is used; the list is released right after.
        memcpy(server_addr, found->ai_addr, sizeof(struct sockaddr_in));
        freeaddrinfo(found);
    }

    // Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers.
    //
    // sockfd: connected socket.
    // host:   value of the Host header.
    // path:   request path WITHOUT the leading '/'.
    //
    // A request that does not fit the local buffer is not sent at all (a
    // truncated request would be malformed), and send() is repeated until every
    // byte is written because it may accept only part of the data. On failure a
    // message is printed and the sending side is shut down, so the following
    // receive ends promptly instead of waiting for a reply that cannot come.
    void send_http_request(SOCKET sockfd, const char* host, const char* path) {
        char request[4096];
        int request_len = snprintf(request, sizeof(request),
            "GET /%s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", path, host);
        if (request_len < 0 || (size_t)request_len >= sizeof(request)) {
            printf("HTTP请求过长,无法发送\n");
            shutdown(sockfd, SD_SEND);
            return;
        }

        int sent_total = 0;
        while (sent_total < request_len) {
            int sent = send(sockfd, request + sent_total, request_len - sent_total, 0);
            if (sent == SOCKET_ERROR) {
                printf("发送HTTP请求失败:%d\n", WSAGetLastError());
                shutdown(sockfd, SD_SEND);
                return;
            }
            sent_total += sent;
        }
    }

    // Reads an HTTP response from sockfd, prints the response header and writes
    // the body to output_file. The socket is always closed before returning.
    //
    // Received bytes are collected in a NUL-terminated header buffer until the
    // "\r\n\r\n" terminator is seen, so the search never reads past the received
    // data and a header that arrives in several recv() calls is still recognised.
    // Headers that do not fit the 16 KiB header buffer are reported as an error.
    void receive_http_response(SOCKET sockfd, const char* output_file) {
        FILE* file = fopen(output_file, "wb");
        if (!file) {
            printf("打开输出文件失败\n");
            closesocket(sockfd);
            return;
        }

        char buffer[4096];
        char header[16384];
        size_t header_len = 0;
        int bytes_received;
        bool header_passed = false;
        bool failed = false;

        while ((bytes_received = recv(sockfd, buffer, sizeof(buffer), 0)) > 0) {
            const char* body = buffer;
            size_t body_len = (size_t)bytes_received;

            if (!header_passed) {
                // Keep one byte free for the terminator that makes strstr() safe.
                if (header_len + body_len >= sizeof(header)) {
                    printf("HTTP响应头过长\n");
                    failed = true;
                    break;
                }
                memcpy(header + header_len, buffer, body_len);
                header_len += body_len;
                header[header_len] = '\0';

                char* header_end = strstr(header, "\r\n\r\n");
                if (!header_end) {
                    // Header not complete yet; keep receiving.
                    continue;
                }

                *header_end = '\0';
                printf("HTTP响应头:\n%s\n", header);
                header_passed = true;

                // Whatever follows the blank line in this chunk is already body.
                body = header_end + 4;
                body_len = header_len - (size_t)(body - header);
            }

            if (body_len > 0 && fwrite(body, 1, body_len, file) != body_len) {
                printf("写入输出文件失败\n");
                failed = true;
                break;
            }
        }

        if (!failed) {
            if (bytes_received < 0) {
                printf("接收HTTP响应时出错:%d\n", WSAGetLastError());
            }
            else if (!header_passed) {
                printf("未收到完整的HTTP响应头\n");
            }
            else {
                printf("文件下载成功:%s\n", output_file);
            }
        }

        fclose(file);
        closesocket(sockfd);
    }

  5. 编译运行crawlerc.exe。

  6. 查看example.html下载成功。

二、修改代码,接收所有的请求并下载网页源码与所有文件。

1. 大概思路

  1. 使用TCP连接,
  2. 思路为发送Get请求,
  3. 接收响应,
  4. 解析响应头并打印,
  5. 将网页源码写入一个html文件中,
  6. 解析此网页中所有的链接,
  7. 关闭TCP连接
  8. 对着解析后的链接发送TCP 连接请求,
  9. 对着解析后的链接发送Get请求,
  10. 接收响应,
  11. 解析响应头并打印,
  12. 将数据写入到同名文件中。
  13. 关闭TCP连接
  14. 循环发送连接下载文件。

2.代码实现

  1. 编写代码访问http://books.toscrape.com/并下载所有文件。

  2. main.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    // main.c
    #include <stdio.h>
    #include <winsock2.h>
    #include "http_client_utils.h"

    #pragma comment(lib, "ws2_32.lib")

    int main() {
    const char* url = "http://books.toscrape.com/"; // 替换为您的URL
    const char* output_dir = "downloaded_files"; // 新建文件夹保存下载的文件
    const char* output_file = "downloaded_files/toscrape.html"; // 替换为输出文件名
    int port = 80; // 默认HTTP端口

    char host[256], path[256] = "";
    sscanf_s(url, "http://%255[^/]/%255[^\n]", host, (unsigned int)sizeof(host), path, (unsigned int)sizeof(path));

    // 初始化Winsock和创建套接字
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // 解析主机名并获取服务器地址
    struct sockaddr_in server_addr;
    resolve_hostname(host, port, &server_addr);

    // 连接服务器
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
    printf("连接服务器失败\n");
    closesocket(sockfd);
    WSACleanup();
    return 1;
    }

    // 发送HTTP请求并接收响应
    char* html_content = NULL;
    send_http_request(sockfd, host, path);
    receive_http_response(sockfd, output_file, &html_content);

    // 下载附加文件
    if (html_content) {
    download_additional_files(html_content, "http://books.toscrape.com", output_dir);
    free(html_content);
    }

    // 清理Winsock
    WSACleanup();
    return 0;
    }
  3. http_client_utils.h

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    // http_client_utils.h
    #ifndef HTTP_CLIENT_UTILS_H
    #define HTTP_CLIENT_UTILS_H

    #include <winsock2.h>
    #include <stdbool.h>

    /* Starts Winsock 2.2; prints an error and exits the process on failure. */
    void initialize_winsock();
    /* Creates an AF_INET stream socket; on failure calls WSACleanup() and exits. */
    SOCKET create_socket();
    /* Resolves hostname/port with getaddrinfo() (IPv4 only) and copies the first
     * result into *server_addr; on failure calls WSACleanup() and exits. */
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);
    /* Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers on sockfd. */
    void send_http_request(SOCKET sockfd, const char* host, const char* path);
    /* Reads the response from sockfd, prints the response header, writes the body
     * to output_file (creating its parent folder first) and closes sockfd.
     * On the normal path *html_content receives a heap-allocated, NUL-terminated
     * copy of the body; main() in this example releases it with free(). */
    void receive_http_response(SOCKET sockfd, const char* output_file, char** html_content);
    /* Returns true when url begins with "http://" or "https://". */
    bool is_absolute_url(const char* url);
    /* Scans html_content for href="..." and src="..." values, resolves relative
     * ones against base_url and downloads each target over HTTP (port 80) into
     * output_dir. */
    void download_additional_files(const char* html_content, const char* base_url, const char* output_dir);

    #endif // HTTP_CLIENT_UTILS_H

  4. http_client_utils.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    // http_client_utils.c
    #include "http_client_utils.h"
    #include <stdio.h>
    #include <string.h>
    #include <ws2tcpip.h>
    #include <stdlib.h>
    #include <direct.h> // 用于创建目录


    // Starts Winsock (requesting version 2.2) for this process.
    // On failure a message is printed and the process exits with status 1.
    void initialize_winsock() {
        WSADATA startup_info;
        const int startup_status = WSAStartup(MAKEWORD(2, 2), &startup_info);
        if (startup_status == 0) {
            return;
        }
        printf("Winsock初始化失败\n");
        exit(1);
    }

    // Creates an AF_INET stream socket and returns it.
    // If socket() fails, prints a message, calls WSACleanup() and exits with status 1.
    SOCKET create_socket() {
        const SOCKET stream_socket = socket(AF_INET, SOCK_STREAM, 0);
        if (stream_socket == INVALID_SOCKET) {
            printf("创建套接字失败\n");
            WSACleanup();
            exit(1);
        }
        return stream_socket;
    }

    // Looks up hostname (IPv4, stream sockets only) and copies the first address
    // returned by getaddrinfo() into *server_addr.
    // If the lookup fails, prints a message, calls WSACleanup() and exits with status 1.
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
        // getaddrinfo() takes the service as text, so render the port number first.
        char service[6];
        snprintf(service, sizeof(service), "%d", port);

        struct addrinfo query;
        memset(&query, 0, sizeof(query));
        query.ai_socktype = SOCK_STREAM;
        query.ai_family = AF_INET;

        struct addrinfo* found;
        const int lookup_status = getaddrinfo(hostname, service, &query, &found);
        if (lookup_status != 0) {
            printf("无法解析主机名\n");
            WSACleanup();
            exit(1);
        }

        // Only the head of the result list is used; the list is released right after.
        memcpy(server_addr, found->ai_addr, sizeof(struct sockaddr_in));
        freeaddrinfo(found);
    }

    // Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers.
    //
    // sockfd: connected socket.
    // host:   value of the Host header.
    // path:   request path WITHOUT the leading '/'.
    //
    // A request that does not fit the local buffer is not sent at all (a
    // truncated request would be malformed), and send() is repeated until every
    // byte is written because it may accept only part of the data. On failure a
    // message is printed and the sending side is shut down, so the following
    // receive ends promptly instead of waiting for a reply that cannot come.
    void send_http_request(SOCKET sockfd, const char* host, const char* path) {
        char request[4096];
        int request_len = snprintf(request, sizeof(request),
            "GET /%s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", path, host);
        if (request_len < 0 || (size_t)request_len >= sizeof(request)) {
            printf("HTTP请求过长,无法发送\n");
            shutdown(sockfd, SD_SEND);
            return;
        }

        int sent_total = 0;
        while (sent_total < request_len) {
            int sent = send(sockfd, request + sent_total, request_len - sent_total, 0);
            if (sent == SOCKET_ERROR) {
                printf("发送HTTP请求失败:%d\n", WSAGetLastError());
                shutdown(sockfd, SD_SEND);
                return;
            }
            sent_total += sent;
        }
    }


    // Reads an HTTP response from sockfd, prints the response header, writes the
    // body to output_file and optionally hands a copy of the body to the caller.
    //
    // sockfd:       connected socket; always closed before returning.
    // output_file:  destination path; its parent folder is created if missing.
    // html_content: when non-NULL, *html_content is set to a heap-allocated,
    //               NUL-terminated copy of the body that the caller must free(),
    //               or to NULL when no complete response was stored. Passing
    //               NULL means the caller does not want the copy.
    //
    // Everything received is appended to one growing, NUL-terminated buffer, so
    // the header search never reads past the received data and a header split
    // across several recv() calls is still recognised.
    void receive_http_response(SOCKET sockfd, const char* output_file, char** html_content) {
        // Give the caller a defined value on every path, including early returns.
        if (html_content) {
            *html_content = NULL;
        }

        // Create the parent folder of the output file if there is one.
        char output_folder[512];
        strncpy_s(output_folder, sizeof(output_folder), output_file, _TRUNCATE);
        char* last_slash = strrchr(output_folder, '/');
        if (last_slash) {
            *last_slash = '\0';
            _mkdir(output_folder);
        }

        FILE* file = fopen(output_file, "wb");
        if (!file) {
            printf("打开输出文件失败:%s\n", output_file);
            closesocket(sockfd);
            return;
        }

        char buffer[4096];
        int bytes_received;
        bool header_passed = false;
        bool failed = false;
        size_t content_length = 0;
        char* content = NULL;

        while ((bytes_received = recv(sockfd, buffer, sizeof(buffer), 0)) > 0) {
            size_t chunk_len = (size_t)bytes_received;

            // Grow through a temporary so the old buffer is not lost on failure.
            char* grown = realloc(content, content_length + chunk_len + 1);
            if (!grown) {
                printf("内存分配失败\n");
                failed = true;
                break;
            }
            content = grown;
            memcpy(content + content_length, buffer, chunk_len);
            content_length += chunk_len;
            content[content_length] = '\0';

            // By default the bytes to store are exactly the chunk just appended.
            const char* to_write = content + content_length - chunk_len;
            size_t write_len = chunk_len;

            if (!header_passed) {
                char* header_end = strstr(content, "\r\n\r\n");
                if (!header_end) {
                    // Header not complete yet; keep receiving.
                    continue;
                }

                *header_end = '\0';
                printf("HTTP响应头:\n%s\n", content);

                // Drop the header so the buffer holds only the body from now on
                // (the +1 moves the terminating NUL along with the data).
                header_end += 4;
                content_length -= (size_t)(header_end - content);
                memmove(content, header_end, content_length + 1);
                header_passed = true;

                to_write = content;
                write_len = content_length;
            }

            if (write_len > 0 && fwrite(to_write, 1, write_len, file) != write_len) {
                printf("写入输出文件失败:%s\n", output_file);
                failed = true;
                break;
            }
        }

        if (!failed) {
            if (bytes_received < 0) {
                printf("接收HTTP响应时出错:%d\n", WSAGetLastError());
            }
            else if (!header_passed) {
                printf("未收到完整的HTTP响应头\n");
            }
            else {
                printf("文件下载成功:%s\n", output_file);
            }
        }

        // Hand the body over only when it is a real body; otherwise release it.
        if (header_passed && !failed && html_content) {
            *html_content = content;
        }
        else {
            free(content);
        }

        fclose(file);
        closesocket(sockfd);
    }
    // Reports whether url begins with the "http://" or "https://" scheme prefix.
    bool is_absolute_url(const char* url) {
        if (strncmp(url, "http://", 7) == 0) {
            return true;
        }
        return strncmp(url, "https://", 8) == 0;
    }

    void download_additional_files(const char* html_content, const char* base_url, const char* output_dir) {
    const char* link_start;
    const char* cursor = html_content;

    // 创建下载文件夹
    _mkdir(output_dir);

    while ((link_start = strstr(cursor, "href=\"")) || (link_start = strstr(cursor, "src=\""))) {
    if ((link_start = strstr(cursor, "href=\"")) && (strstr(cursor, "src=\"") == NULL || link_start < strstr(cursor, "src=\""))) {
    link_start += 6; // 跳过 "href=\""
    }
    else if ((link_start = strstr(cursor, "src=\""))) {
    link_start += 5; // 跳过 "src=\""
    }
    else {
    break;
    }

    const char* link_end = strchr(link_start, '"');
    if (!link_end) {
    break;
    }

    size_t link_length = link_end - link_start;
    char* link = malloc(link_length + 1);
    if (!link) {
    printf("内存分配失败\n");
    return;
    }
    strncpy_s(link, link_length + 1, link_start, link_length);

    // 确定完整的下载 URL
    char full_url[4096];
    if (is_absolute_url(link)) {
    strncpy_s(full_url, sizeof(full_url), link, _TRUNCATE);
    }
    else {
    // 去掉基URL中的末尾斜杠,防止双斜杠
    size_t base_url_length = strlen(base_url);
    if (base_url[base_url_length - 1] == '/') {
    base_url_length--;
    }
    snprintf(full_url, sizeof(full_url), "%.*s/%s", (int)base_url_length, base_url, link);
    }
    printf("下载链接: %s\n", full_url);

    // 下载文件
    char host[256], path[256] = "";
    sscanf_s(full_url, "http://%255[^/]/%255[^\n]", host, (unsigned int)sizeof(host), path, (unsigned int)sizeof(path));

    initialize_winsock();
    SOCKET sockfd = create_socket();
    struct sockaddr_in server_addr;
    resolve_hostname(host, 80, &server_addr);
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
    printf("连接服务器失败\n");
    closesocket(sockfd);
    WSACleanup();
    free(link);
    continue;
    }
    send_http_request(sockfd, host, path);

    // 创建保存路径
    char output_file[512];
    snprintf(output_file, sizeof(output_file), "%s/%s", output_dir, link);

    // 确保目录存在(如果需要可以递归创建子目录)
    for (char* p = output_file; *p; ++p) {
    if (*p == '/') {
    *p = '\0';
    _mkdir(output_file);
    *p = '/';
    }
    }

    char* dummy_content;
    receive_http_response(sockfd, output_file, &dummy_content);
    free(link);

    cursor = link_end;
    }
    }

  5. 编译运行crawlerc.exe。

  6. 查看文件夹下载成功。

三、修改代码,接收所有的请求并下载所有jpg格式的文件。

1. 大概思路

  1. TCP连接
  2. 发送Get请求,
  3. 接收响应,
  4. 解析响应头并打印,
  5. 将网页源码写入一个html文件中,
  6. 解析此网页中所有的图片 链接,
  7. 保存所有链接,
  8. 关闭TCP连接
  9. 开启 TCP连接
  10. 对着解析后的链接发送Get请求,
  11. 接收响应,
  12. 解析响应头并打印,
  13. 将数据写入到同名文件中,
  14. 关闭TCP连接
  15. 循环发送TCP与Get请求下载所有文件,

2.代码实现

  1. 编写代码访问http://books.toscrape.com/并下载其中所有jpg图片。

  2. main.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    #include <stdio.h>
    #include <winsock2.h>
    #include "http_client_utils.h"

    #pragma comment(lib, "ws2_32.lib")

    int main() {
    const char* url = "http://books.toscrape.com/"; // 替换为您的URL
    const char* output_dir = "downloaded_files"; // 新建文件夹保存下载的文件
    const char* output_file = "downloaded_files/toscrape.html"; // 替换为输出文件名
    int port = 80; // 默认HTTP端口

    char host[256], path[256] = "";
    sscanf_s(url, "http://%255[^/]/%255[^\n]", host, (unsigned int)sizeof(host), path, (unsigned int)sizeof(path));

    // 初始化Winsock和创建套接字
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // 声明 server_addr 变量
    struct sockaddr_in server_addr;

    // 解析主机名并获取服务器地址
    resolve_hostname(host, port, &server_addr);

    // 连接服务器
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
    printf("连接服务器失败\n");
    closesocket(sockfd);
    WSACleanup();
    return 1;
    }

    // 发送HTTP请求并接收响应
    char* html_content = NULL;
    send_http_request(sockfd, host, path);
    receive_http_response(sockfd, output_file, &html_content);

    // 提取JPG图片链接
    char** links = NULL;
    int num_links = 0;
    if (html_content) {
    extract_jpg_links(html_content, "http://books.toscrape.com", &links, &num_links);
    free(html_content);
    }

    // 批量下载JPG文件
    if (num_links > 0) {
    download_jpg_files((const char**)links, num_links, output_dir);
    for (int i = 0; i < num_links; i++) {
    free(links[i]);
    }
    free(links);
    }

    // 清理Winsock
    WSACleanup();
    return 0;
    }

  3. http_client_utils.h

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    #ifndef HTTP_CLIENT_UTILS_H
    #define HTTP_CLIENT_UTILS_H

    #include <winsock2.h>
    #include <stdbool.h>

    /* Starts Winsock 2.2; prints an error and exits the process on failure. */
    void initialize_winsock();
    /* Creates an AF_INET stream socket; on failure calls WSACleanup() and exits. */
    SOCKET create_socket();
    /* Resolves hostname/port with getaddrinfo() (IPv4 only) and copies the first
     * result into *server_addr; on failure calls WSACleanup() and exits. */
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);
    /* Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers on sockfd. */
    void send_http_request(SOCKET sockfd, const char* host, const char* path);
    /* Reads the response from sockfd, prints the response header, writes the body
     * to output_file (creating its parent folder first) and closes sockfd.
     * On the normal path *html_content receives a heap-allocated, NUL-terminated
     * copy of the body; main() in this example releases it with free(). */
    void receive_http_response(SOCKET sockfd, const char* output_file, char** html_content);
    /* Returns true when url begins with "http://" or "https://". */
    bool is_absolute_url(const char* url);
    /* Collects the src="..." values of html_content that contain ".jpg" or ".jpeg",
     * resolved against base_url, into a heap-allocated array stored in *links;
     * *num_links receives the count. main() in this example frees each string
     * and then the array. */
    void extract_jpg_links(const char* html_content, const char* base_url, char** links[], int* num_links);
    /* Downloads each of the num_links URLs over HTTP (port 80) into output_dir,
     * naming every file after the last segment of its URL path. */
    void download_jpg_files(const char* links[], int num_links, const char* output_dir);

    #endif // HTTP_CLIENT_UTILS_H

  4. http_client_utils.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    #include "http_client_utils.h"
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <direct.h> // 用于创建目录

    // Starts Winsock (requesting version 2.2) for this process.
    // On failure a message is printed and the process exits with status 1.
    void initialize_winsock() {
        WSADATA startup_info;
        const int startup_status = WSAStartup(MAKEWORD(2, 2), &startup_info);
        if (startup_status == 0) {
            return;
        }
        printf("Winsock初始化失败\n");
        exit(1);
    }

    // Creates an AF_INET stream socket and returns it.
    // If socket() fails, prints a message, calls WSACleanup() and exits with status 1.
    SOCKET create_socket() {
        const SOCKET stream_socket = socket(AF_INET, SOCK_STREAM, 0);
        if (stream_socket == INVALID_SOCKET) {
            printf("创建套接字失败\n");
            WSACleanup();
            exit(1);
        }
        return stream_socket;
    }

    // Looks up hostname (IPv4, stream sockets only) and copies the first address
    // returned by getaddrinfo() into *server_addr.
    // If the lookup fails, prints the host name, calls WSACleanup() and exits
    // with status 1.
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
        // getaddrinfo() takes the service as text, so render the port number first.
        char service[6];
        snprintf(service, sizeof(service), "%d", port);

        struct addrinfo query;
        memset(&query, 0, sizeof(query));
        query.ai_socktype = SOCK_STREAM;
        query.ai_family = AF_INET;

        struct addrinfo* found;
        const int lookup_status = getaddrinfo(hostname, service, &query, &found);
        if (lookup_status != 0) {
            printf("无法解析主机名: %s\n", hostname);
            WSACleanup();
            exit(1);
        }

        // Only the head of the result list is used; the list is released right after.
        memcpy(server_addr, found->ai_addr, sizeof(struct sockaddr_in));
        freeaddrinfo(found);
    }

    // Sends "GET /<path> HTTP/1.1" with Host and "Connection: close" headers.
    //
    // sockfd: connected socket.
    // host:   value of the Host header.
    // path:   request path WITHOUT the leading '/'.
    //
    // A request that does not fit the local buffer is not sent at all (a
    // truncated request would be malformed), and send() is repeated until every
    // byte is written because it may accept only part of the data. On failure a
    // message is printed and the sending side is shut down, so the following
    // receive ends promptly instead of waiting for a reply that cannot come.
    void send_http_request(SOCKET sockfd, const char* host, const char* path) {
        char request[4096];
        int request_len = snprintf(request, sizeof(request),
            "GET /%s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", path, host);
        if (request_len < 0 || (size_t)request_len >= sizeof(request)) {
            printf("HTTP请求过长,无法发送\n");
            shutdown(sockfd, SD_SEND);
            return;
        }

        int sent_total = 0;
        while (sent_total < request_len) {
            int sent = send(sockfd, request + sent_total, request_len - sent_total, 0);
            if (sent == SOCKET_ERROR) {
                printf("发送HTTP请求失败:%d\n", WSAGetLastError());
                shutdown(sockfd, SD_SEND);
                return;
            }
            sent_total += sent;
        }
    }

    // Reads an HTTP response from sockfd, prints the response header, writes the
    // body to output_file and optionally hands a copy of the body to the caller.
    //
    // sockfd:       connected socket; always closed before returning.
    // output_file:  destination path; its parent folder is created if missing.
    // html_content: when non-NULL, *html_content is set to a heap-allocated,
    //               NUL-terminated copy of the body that the caller must free(),
    //               or to NULL when no complete response was stored. Passing
    //               NULL means the caller does not want the copy.
    //
    // Everything received is appended to one growing, NUL-terminated buffer, so
    // the header search never reads past the received data and a header split
    // across several recv() calls is still recognised.
    void receive_http_response(SOCKET sockfd, const char* output_file, char** html_content) {
        // Give the caller a defined value on every path, including early returns.
        if (html_content) {
            *html_content = NULL;
        }

        // Create the parent folder of the output file if there is one.
        char output_folder[512];
        strncpy_s(output_folder, sizeof(output_folder), output_file, _TRUNCATE);
        char* last_slash = strrchr(output_folder, '/');
        if (last_slash) {
            *last_slash = '\0';
            _mkdir(output_folder);
        }

        FILE* file = fopen(output_file, "wb");
        if (!file) {
            printf("打开输出文件失败:%s\n", output_file);
            closesocket(sockfd);
            return;
        }

        char buffer[4096];
        int bytes_received;
        bool header_passed = false;
        bool failed = false;
        size_t content_length = 0;
        char* content = NULL;

        while ((bytes_received = recv(sockfd, buffer, sizeof(buffer), 0)) > 0) {
            size_t chunk_len = (size_t)bytes_received;

            // Grow through a temporary so the old buffer is not lost on failure.
            char* grown = realloc(content, content_length + chunk_len + 1);
            if (!grown) {
                printf("内存分配失败\n");
                failed = true;
                break;
            }
            content = grown;
            memcpy(content + content_length, buffer, chunk_len);
            content_length += chunk_len;
            content[content_length] = '\0';

            // By default the bytes to store are exactly the chunk just appended.
            const char* to_write = content + content_length - chunk_len;
            size_t write_len = chunk_len;

            if (!header_passed) {
                char* header_end = strstr(content, "\r\n\r\n");
                if (!header_end) {
                    // Header not complete yet; keep receiving.
                    continue;
                }

                *header_end = '\0';
                printf("HTTP响应头:\n%s\n", content);

                // Drop the header so the buffer holds only the body from now on
                // (the +1 moves the terminating NUL along with the data).
                header_end += 4;
                content_length -= (size_t)(header_end - content);
                memmove(content, header_end, content_length + 1);
                header_passed = true;

                to_write = content;
                write_len = content_length;
            }

            if (write_len > 0 && fwrite(to_write, 1, write_len, file) != write_len) {
                printf("写入输出文件失败:%s\n", output_file);
                failed = true;
                break;
            }
        }

        if (!failed) {
            if (bytes_received < 0) {
                printf("接收HTTP响应时出错:%d\n", WSAGetLastError());
            }
            else if (!header_passed) {
                printf("未收到完整的HTTP响应头\n");
            }
            else {
                printf("文件下载成功:%s\n", output_file);
            }
        }

        // Hand the body over only when it is a real body; otherwise release it.
        if (header_passed && !failed && html_content) {
            *html_content = content;
        }
        else {
            free(content);
        }

        fclose(file);
        closesocket(sockfd);
    }

    // Reports whether url begins with the "http://" or "https://" scheme prefix.
    bool is_absolute_url(const char* url) {
        if (strncmp(url, "http://", 7) == 0) {
            return true;
        }
        return strncmp(url, "https://", 8) == 0;
    }

    // Collects the JPG links referenced by src="..." attributes in html_content.
    //
    // html_content: NUL-terminated page source to scan.
    // base_url:     prefix used to turn relative links into full URLs; one
    //               trailing '/' is ignored.
    // links:        receives a heap-allocated array of heap-allocated URL strings
    //               (NULL if the array itself could not be allocated). The caller
    //               frees every string and then the array.
    // num_links:    receives the number of valid entries in *links.
    //
    // A link qualifies when it contains ".jpg" or ".jpeg". If memory runs out the
    // links gathered so far are kept and the function returns early.
    void extract_jpg_links(const char* html_content, const char* base_url, char** links[], int* num_links) {
        const char* cursor = html_content;
        int links_capacity = 10;

        // Allocate the initial link array.
        *num_links = 0;
        *links = malloc(links_capacity * sizeof(**links));
        if (!*links) {
            printf("内存分配失败\n");
            return;
        }

        // Length of base_url without a trailing slash (guards the empty string).
        size_t base_url_length = strlen(base_url);
        if (base_url_length > 0 && base_url[base_url_length - 1] == '/') {
            base_url_length--;
        }

        const char* link_start;
        while ((link_start = strstr(cursor, "src=\""))) {
            link_start += 5; // skip src="

            const char* link_end = strchr(link_start, '"');
            if (!link_end) {
                break;
            }
            cursor = link_end;

            size_t link_length = (size_t)(link_end - link_start);
            char* link = malloc(link_length + 1);
            if (!link) {
                printf("内存分配失败\n");
                return;
            }
            memcpy(link, link_start, link_length);
            link[link_length] = '\0';

            // Keep JPG links only.
            if (strstr(link, ".jpg") || strstr(link, ".jpeg")) {
                // Build the full download URL.
                char full_url[4096];
                if (is_absolute_url(link)) {
                    snprintf(full_url, sizeof(full_url), "%s", link);
                }
                else {
                    snprintf(full_url, sizeof(full_url), "%.*s/%s", (int)base_url_length, base_url, link);
                }

                // Grow through a temporary so the array survives a failed realloc.
                if (*num_links >= links_capacity) {
                    int new_capacity = links_capacity * 2;
                    char** grown = realloc(*links, new_capacity * sizeof(**links));
                    if (!grown) {
                        printf("内存分配失败\n");
                        free(link);
                        return;
                    }
                    *links = grown;
                    links_capacity = new_capacity;
                }

                char* stored_url = _strdup(full_url);
                if (!stored_url) {
                    printf("内存分配失败\n");
                    free(link);
                    return;
                }
                (*links)[*num_links] = stored_url;
                (*num_links)++;
            }

            free(link);
        }
    }

    void download_jpg_files(const char* links[], int num_links, const char* output_dir) {
    // 创建下载文件夹
    _mkdir(output_dir);

    // 批量下载图片
    for (int i = 0; i < num_links; i++) {
    printf("下载链接: %s\n", links[i]);

    // 提取主机名和路径
    char host[256], path[256] = "";
    sscanf_s(links[i], "http://%255[^/]/%255[^\n]", host, (unsigned int)sizeof(host), path, (unsigned int)sizeof(path));

    initialize_winsock();
    SOCKET sockfd = create_socket();
    struct sockaddr_in server_addr;
    resolve_hostname(host, 80, &server_addr);
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
    printf("连接服务器失败\n");
    closesocket(sockfd);
    WSACleanup();
    continue;
    }
    send_http_request(sockfd, host, path);

    // 创建保存路径,只保存在output_dir文件夹中,不保留原始路径结构
    char output_file[512];
    snprintf(output_file, sizeof(output_file), "%s/%s", output_dir, strrchr(path, '/') + 1);

    FILE* file = fopen(output_file, "wb");
    if (!file) {
    printf("打开输出文件失败:%s\n", output_file);
    }
    else {
    char* dummy_content;
    receive_http_response(sockfd, output_file, &dummy_content);
    fclose(file);
    }
    }
    }

  5. 编译运行crawlerc.exe。

  6. 查看文件夹下载成功。

四、修改代码,支持https协议,并使用User-Agent请求头伪装成正常浏览器访问并下载文件

1. 大概思路

  1. 思路为发送Get请求,
  2. 接收响应,
  3. 打印响应头 ,
  4. 将接受到的数据写入文件中,

2.代码实现

  1. 编写代码访问https://s5.51cto.com/oss/202408/30/a7a3092691d8f3fdb3322730c0fba80fd82f85.png并下载该png图片文件。

  2. main.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    #define _CRT_SECURE_NO_WARNINGS
    #include <stdio.h>
    #include <winsock2.h>
    #include "http_client_utils.h"
    #pragma comment(lib, "ws2_32.lib")
    #include <openssl/ssl.h>
    #include <openssl/err.h>

    int main() {
    const char* url = "https://s5.51cto.com/oss/202408/30/a7a3092691d8f3fdb3322730c0fba80fd82f85.png"; // 替换为您的URL
    const char* output_file = "f85.png"; // 替换为输出文件名
    int port = 443; // 默认HTTPS端口

    char host[256], path[256] = "";
    sscanf(url, "https://%255[^/]/%255[^\n]", host, path);

    // 初始化Winsock和创建套接字
    initialize_winsock();
    SOCKET sockfd = create_socket();

    // 解析主机名并获取服务器地址
    struct sockaddr_in server_addr;
    resolve_hostname(host, port, &server_addr);

    // 连接服务器
    if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
    printf("连接服务器失败\n");
    closesocket(sockfd);
    WSACleanup();
    return 1;
    }

    // 创建SSL上下文并建立SSL连接
    SSL_CTX* ctx = create_ssl_context();
    SSL* ssl = connect_ssl(ctx, sockfd);

    // 发送HTTP请求并接收响应
    send_http_request(ssl, host, path);
    receive_http_response(ssl, output_file);

    // 清理SSL
    SSL_CTX_free(ctx);
    ERR_free_strings();

    // 清理Winsock
    WSACleanup();
    return 0;
    }

  3. http_client_utils.h

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    #ifndef HTTP_CLIENT_UTILS_H
    #define HTTP_CLIENT_UTILS_H

    #include <winsock2.h>
    #include <openssl/ssl.h>

    /* Starts Winsock 2.2; prints an error and exits the process on failure. */
    void initialize_winsock();
    /* Creates an AF_INET stream socket; on failure calls WSACleanup() and exits. */
    SOCKET create_socket();
    /* Resolves hostname/port with getaddrinfo() (IPv4 only) and copies the first
     * result into *server_addr; on failure calls WSACleanup() and exits. */
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);
    /* Creates an SSL_CTX using TLS_client_method(); prints the OpenSSL error
     * queue and exits on failure. */
    SSL_CTX* create_ssl_context();
    /* Creates an SSL object from ctx, attaches sockfd to it and runs SSL_connect();
     * prints the OpenSSL error queue and exits on failure. */
    SSL* connect_ssl(SSL_CTX* ctx, SOCKET sockfd);
    /* Writes "GET /<path> HTTP/1.1" with Host, "Connection: close" and a
     * browser-like User-Agent header to ssl. */
    void send_http_request(SSL* ssl, const char* host, const char* path);
    /* Reads the response from ssl, prints the response header, writes the body
     * to output_file and frees ssl before returning. */
    void receive_http_response(SSL* ssl, const char* output_file);

    #endif // HTTP_CLIENT_UTILS_H
  4. http_client_utils.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    #include "http_client_utils.h"
    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>
    #include <ws2tcpip.h>
    #include <openssl/ssl.h>
    #include <openssl/err.h>

    // Starts Winsock (requesting version 2.2) for this process.
    // On failure a message is printed and the process exits with status 1.
    void initialize_winsock() {
        WSADATA startup_info;
        const int startup_status = WSAStartup(MAKEWORD(2, 2), &startup_info);
        if (startup_status == 0) {
            return;
        }
        printf("Winsock初始化失败\n");
        exit(1);
    }

    // Creates an AF_INET stream socket and returns it.
    // If socket() fails, prints a message, calls WSACleanup() and exits with status 1.
    SOCKET create_socket() {
        const SOCKET stream_socket = socket(AF_INET, SOCK_STREAM, 0);
        if (stream_socket == INVALID_SOCKET) {
            printf("创建套接字失败\n");
            WSACleanup();
            exit(1);
        }
        return stream_socket;
    }

    // Looks up hostname (IPv4, stream sockets only) and copies the first address
    // returned by getaddrinfo() into *server_addr.
    // If the lookup fails, prints a message, calls WSACleanup() and exits with status 1.
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
        // getaddrinfo() takes the service as text, so render the port number first.
        char service[6];
        snprintf(service, sizeof(service), "%d", port);

        struct addrinfo query;
        memset(&query, 0, sizeof(query));
        query.ai_socktype = SOCK_STREAM;
        query.ai_family = AF_INET;

        struct addrinfo* found;
        const int lookup_status = getaddrinfo(hostname, service, &query, &found);
        if (lookup_status != 0) {
            printf("无法解析主机名\n");
            WSACleanup();
            exit(1);
        }

        // Only the head of the result list is used; the list is released right after.
        memcpy(server_addr, found->ai_addr, sizeof(struct sockaddr_in));
        freeaddrinfo(found);
    }

    // Loads the OpenSSL error strings and algorithms, then builds a client-side
    // context with TLS_client_method() and returns it.
    // If the context cannot be created, prints a message plus the OpenSSL error
    // queue and exits with status 1.
    SSL_CTX* create_ssl_context() {
        SSL_load_error_strings();
        OpenSSL_add_ssl_algorithms();

        SSL_CTX* client_ctx = SSL_CTX_new(TLS_client_method());
        if (client_ctx == NULL) {
            printf("无法创建SSL上下文\n");
            ERR_print_errors_fp(stderr);
            exit(1);
        }
        return client_ctx;
    }

    // Creates an SSL object from ctx, attaches the already connected sockfd to it
    // and runs the client handshake with SSL_connect(). Returns the SSL object.
    // If either SSL_new() or SSL_connect() fails, prints a message plus the
    // OpenSSL error queue and exits with status 1.
    SSL* connect_ssl(SSL_CTX* ctx, SOCKET sockfd) {
        SSL* session = SSL_new(ctx);
        if (session == NULL) {
            printf("无法创建SSL结构\n");
            ERR_print_errors_fp(stderr);
            exit(1);
        }

        SSL_set_fd(session, sockfd);

        const int handshake_result = SSL_connect(session);
        if (handshake_result <= 0) {
            printf("无法建立SSL连接\n");
            ERR_print_errors_fp(stderr);
            exit(1);
        }
        return session;
    }

    // Writes "GET /<path> HTTP/1.1" with Host, "Connection: close" and a
    // browser-like User-Agent header to the TLS connection.
    //
    // ssl:  established TLS connection.
    // host: value of the Host header.
    // path: request path WITHOUT the leading '/'.
    //
    // A request that does not fit the local buffer is not sent at all (a
    // truncated request would be malformed). A failed SSL_write() is reported
    // together with the OpenSSL error queue.
    void send_http_request(SSL* ssl, const char* host, const char* path) {
        char request[4096];
        int request_len = snprintf(request, sizeof(request), "GET /%s HTTP/1.1\r\n"
            "Host: %s\r\n"
            "Connection: close\r\n"
            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3\r\n\r\n",
            path, host);
        if (request_len < 0 || (size_t)request_len >= sizeof(request)) {
            printf("HTTP请求过长,无法发送\n");
            return;
        }

        if (SSL_write(ssl, request, request_len) <= 0) {
            printf("发送HTTP请求失败\n");
            ERR_print_errors_fp(stderr);
        }
    }

    // Reads an HTTP response from the TLS connection, prints the response header
    // and writes the body to output_file. ssl is always freed before returning.
    //
    // Received bytes are collected in a NUL-terminated header buffer until the
    // "\r\n\r\n" terminator is seen, so the search never reads past the received
    // data and a header that arrives in several SSL_read() calls is still
    // recognised. Headers that do not fit the 16 KiB header buffer are reported
    // as an error.
    void receive_http_response(SSL* ssl, const char* output_file) {
        FILE* file = fopen(output_file, "wb");
        if (!file) {
            printf("打开输出文件失败\n");
            SSL_free(ssl);
            return;
        }

        char buffer[4096];
        char header[16384];
        size_t header_len = 0;
        int bytes_received;
        bool header_passed = false;
        bool failed = false;

        while ((bytes_received = SSL_read(ssl, buffer, (int)sizeof(buffer))) > 0) {
            const char* body = buffer;
            size_t body_len = (size_t)bytes_received;

            if (!header_passed) {
                // Keep one byte free for the terminator that makes strstr() safe.
                if (header_len + body_len >= sizeof(header)) {
                    printf("HTTP响应头过长\n");
                    failed = true;
                    break;
                }
                memcpy(header + header_len, buffer, body_len);
                header_len += body_len;
                header[header_len] = '\0';

                char* header_end = strstr(header, "\r\n\r\n");
                if (!header_end) {
                    // Header not complete yet; keep receiving.
                    continue;
                }

                *header_end = '\0';
                printf("HTTP响应头:\n%s\n", header);
                header_passed = true;

                // Whatever follows the blank line in this chunk is already body.
                body = header_end + 4;
                body_len = header_len - (size_t)(body - header);
            }

            if (body_len > 0 && fwrite(body, 1, body_len, file) != body_len) {
                printf("写入输出文件失败\n");
                failed = true;
                break;
            }
        }

        if (!failed) {
            if (bytes_received < 0) {
                printf("接收HTTP响应时出错\n");
                ERR_print_errors_fp(stderr);
            }
            else if (!header_passed) {
                printf("未收到完整的HTTP响应头\n");
            }
            else {
                printf("文件下载成功:%s\n", output_file);
            }
        }

        fclose(file);
        SSL_free(ssl);
    }

  5. 编译运行crawlerc.exe。

  6. 查看文件下载成功。

五、修改代码,使其同时支持http与https协议,并从URL中解析出下载文件名

代码实现

  1. main.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    #define _CRT_SECURE_NO_WARNINGS
    #include <stdio.h>
    #include <winsock2.h>
    #include "http_client_utils.h"
    #pragma comment(lib, "ws2_32.lib")
    #include <openssl/ssl.h>
    #include <openssl/err.h>

    // Entry point: download the resource named by argv[1] over HTTP or HTTPS.
    // The scheme selects the port (80 / 443) and whether a TLS session is set
    // up; the output file is named after the last path segment of the URL
    // ("index.html" when the URL has none).
    // Returns 0 after a download attempt, 1 on bad usage, an unparsable URL or
    // a failed TCP connect.
    int main(int argc, char** argv) {
        // Without a URL there is nothing to do; url must never stay NULL below
        if (argc < 2 || argv[1] == NULL) {
            printf("help: get domain");
            return 1;
        }
        const char* url = argv[1];
        printf("adress:\n\t%s\n", url);

        // Pick HTTP or HTTPS from the scheme prefix and reject anything else
        bool use_ssl = strstr(url, "https://") == url;
        if (!use_ssl && strstr(url, "http://") != url) {
            printf("不支持的URL,必须以http://或https://开头\n");
            return 1;
        }
        int port = use_ssl ? 443 : 80;

        // host is mandatory, path is optional (stays "" for a bare domain)
        char host[256] = "", path[256] = "";
        if (sscanf(url, use_ssl ? "https://%255[^/]/%255[^\n]" : "http://%255[^/]/%255[^\n]", host, path) < 1) {
            printf("无法从URL中解析主机名\n");
            return 1;
        }

        // Derive the output file name: last path segment, ignoring any
        // "?query" / "#fragment" suffix (those are not valid in file names).
        size_t path_len = 0;
        while (path[path_len] != '\0' && path[path_len] != '?' && path[path_len] != '#') {
            path_len++;
        }
        size_t name_start = path_len;
        while (name_start > 0 && path[name_start - 1] != '/') {
            name_start--;
        }
        char filename[256];
        if (name_start == path_len) {
            // Empty path or path ending in '/': fall back to a default name
            snprintf(filename, sizeof(filename), "%s", "index.html");
        }
        else {
            snprintf(filename, sizeof(filename), "%.*s", (int)(path_len - name_start), path + name_start);
        }
        printf("下载文件名:%s\n", filename);

        // Initialise Winsock and create the socket
        initialize_winsock();
        SOCKET sockfd = create_socket();

        // Resolve the host name into a server address
        struct sockaddr_in server_addr;
        resolve_hostname(host, port, &server_addr);

        // Connect to the server
        if (connect(sockfd, (struct sockaddr*)&server_addr, sizeof(server_addr)) < 0) {
            printf("连接服务器失败\n");
            closesocket(sockfd);
            WSACleanup();
            return 1;
        }

        SSL_CTX* ctx = NULL;
        SSL* ssl = NULL;

        if (use_ssl) {
            // Create the SSL context and run the TLS handshake
            ctx = create_ssl_context();
            ssl = connect_ssl(ctx, sockfd);
        }

        // Send the request and store the response body
        send_http_request(ssl, sockfd, host, path, use_ssl);
        receive_http_response(ssl, sockfd, filename, use_ssl);

        // Clean up SSL. receive_http_response frees the SSL object but only
        // closes the socket itself in the plain-HTTP case, so close it here.
        if (use_ssl) {
            SSL_CTX_free(ctx);
            ERR_free_strings();
            closesocket(sockfd);
        }

        // Clean up Winsock
        WSACleanup();
        return 0;
    }
  2. http_client_utils.h

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    #ifndef HTTP_CLIENT_UTILS_H
    #define HTTP_CLIENT_UTILS_H

    #include <winsock2.h>
    #include <openssl/ssl.h>
    #include <stdbool.h>

    // Start Winsock (version 2.2 is requested); prints an error and exits the process on failure.
    void initialize_winsock();
    // Create an IPv4 TCP socket; on failure prints an error, cleans up Winsock and exits.
    SOCKET create_socket();
    // Resolve hostname/port to an IPv4 address and copy it into *server_addr;
    // on failure prints an error, cleans up Winsock and exits.
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr);
    // Create a TLS client context; prints the OpenSSL errors and exits on failure.
    SSL_CTX* create_ssl_context();
    // Attach sockfd to a new SSL object and perform the TLS handshake;
    // prints the OpenSSL errors and exits on failure.
    SSL* connect_ssl(SSL_CTX* ctx, SOCKET sockfd);
    // Send a GET request for "/<path>" with the given Host header,
    // through ssl when use_ssl is true and through sockfd otherwise.
    void send_http_request(SSL* ssl, SOCKET sockfd, const char* host, const char* path, bool use_ssl);
    // Read the response, print its headers and write the body to output_file.
    // Before returning it releases the connection: SSL_free(ssl) when use_ssl
    // is true, closesocket(sockfd) otherwise.
    void receive_http_response(SSL* ssl, SOCKET sockfd, const char* output_file, bool use_ssl);

    #endif // HTTP_CLIENT_UTILS_H

  3. http_client_utils.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    #include "http_client_utils.h"
    #include <stdio.h>
    #include <string.h>
    #include <ws2tcpip.h>
    #include <openssl/ssl.h>
    #include <openssl/err.h>

    // Start up the Winsock library, requesting version 2.2.
    // A failed start-up is fatal: a message is printed and the process exits with status 1.
    void initialize_winsock() {
        WSADATA startup_info;
        int status = WSAStartup(MAKEWORD(2, 2), &startup_info);
        if (status == 0) {
            return;
        }

        printf("Winsock初始化失败\n");
        exit(1);
    }

    // Open an IPv4 stream (TCP) socket and return it.
    // If the socket cannot be created, a message is printed, Winsock is
    // cleaned up and the process exits with status 1.
    SOCKET create_socket() {
        SOCKET handle = socket(AF_INET, SOCK_STREAM, 0);
        if (handle != INVALID_SOCKET) {
            return handle;
        }

        printf("创建套接字失败\n");
        WSACleanup();
        exit(1);
    }

    // Look up hostname and fill *server_addr with its first IPv4/TCP result.
    //   hostname    - name to resolve
    //   port        - port number, passed to getaddrinfo as the service string
    //   server_addr - receives the resolved address (sizeof(struct sockaddr_in) bytes are copied)
    // A failed lookup is fatal: a message is printed, Winsock is cleaned up
    // and the process exits with status 1.
    void resolve_hostname(const char* hostname, int port, struct sockaddr_in* server_addr) {
        // getaddrinfo expects the service/port as text
        char service[6];
        snprintf(service, sizeof(service), "%d", port);

        // Restrict the lookup to IPv4 stream sockets
        struct addrinfo query;
        memset(&query, 0, sizeof(query));
        query.ai_socktype = SOCK_STREAM;
        query.ai_family = AF_INET;

        struct addrinfo* found = NULL;
        int status = getaddrinfo(hostname, service, &query, &found);
        if (status != 0) {
            printf("无法解析主机名\n");
            WSACleanup();
            exit(1);
        }

        // Keep a copy of the first result, then release the list
        memcpy(server_addr, found->ai_addr, sizeof(struct sockaddr_in));
        freeaddrinfo(found);
    }

    // Build the client-side SSL_CTX that HTTPS connections are created from.
    // Returns the new context; if it cannot be created, prints the OpenSSL
    // error queue to stderr and terminates the process.
    // NOTE(review): no verification mode or CA store is configured here, so
    // certificate checking is left at the library default - confirm intended.
    SSL_CTX* create_ssl_context() {
        // Library setup: error strings first, then the algorithm tables
        SSL_load_error_strings();
        OpenSSL_add_ssl_algorithms();

        SSL_CTX* context = SSL_CTX_new(TLS_client_method());
        if (context == NULL) {
            printf("无法创建SSL上下文\n");
            ERR_print_errors_fp(stderr);
            exit(1);
        }

        return context;
    }

    // Run a TLS client handshake over an already-connected socket.
    //   ctx    - context the new SSL object is created from
    //   sockfd - connected TCP socket; it is attached to, not owned by, the SSL object
    // Returns the connected SSL object. On any failure the OpenSSL error queue
    // is printed to stderr and the process exits.
    // NOTE(review): the host name is not available here, so no server name
    // (SNI) is sent and no hostname check is set up - confirm this is acceptable.
    SSL* connect_ssl(SSL_CTX* ctx, SOCKET sockfd) {
        SSL* ssl = SSL_new(ctx);
        if (!ssl) {
            printf("无法创建SSL结构\n");
            ERR_print_errors_fp(stderr);
            exit(1);
        }

        // SSL_set_fd takes an int and can fail; SOCKET is a different type, so cast explicitly
        if (SSL_set_fd(ssl, (int)sockfd) != 1) {
            printf("无法建立SSL连接\n");
            ERR_print_errors_fp(stderr);
            SSL_free(ssl);
            exit(1);
        }

        if (SSL_connect(ssl) <= 0) {
            printf("无法建立SSL连接\n");
            ERR_print_errors_fp(stderr);
            SSL_free(ssl);
            exit(1);
        }

        // Debug output
        printf("SSL握手成功\n");
        return ssl;
    }

    // Send "GET /<path>" for <host>, over TLS or over the plain socket.
    //   ssl     - connected SSL object (used only when use_ssl is true)
    //   sockfd  - connected TCP socket (used only when use_ssl is false)
    //   host    - value for the Host header
    //   path    - resource path WITHOUT the leading '/'
    //   use_ssl - selects SSL_write (true) or send (false)
    // Failures cannot be returned (void), so they are reported on the console:
    // a request that does not fit the buffer is not sent at all, and a failed
    // write prints the OpenSSL error queue or the Winsock error code.
    void send_http_request(SSL* ssl, SOCKET sockfd, const char* host, const char* path, bool use_ssl) {
        char request[4096];
        // Header names must not contain spaces ("User-Agent", not "User - Agent"),
        // and the Host value uses a plain %s conversion.
        int length = snprintf(request, sizeof(request), "GET /%s HTTP/1.1\r\n"
            "Host: %s\r\n"
            "Connection: close\r\n"
            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3\r\n\r\n"
            , path, host);

        // A truncated request would lack the terminating blank line; refuse to send it
        if (length < 0 || (size_t)length >= sizeof(request)) {
            printf("HTTP请求过长,无法发送\n");
            return;
        }

        if (use_ssl) {
            if (SSL_write(ssl, request, length) <= 0) {
                printf("发送HTTP请求失败\n");
                ERR_print_errors_fp(stderr);
            }
        }
        else {
            if (send(sockfd, request, length, 0) == SOCKET_ERROR) {
                printf("发送HTTP请求失败\n");
                printf("错误代码:%d\n", WSAGetLastError());
            }
        }
    }

    // Look for a Content-Length header in a NUL-terminated header block.
    // Field names are compared case-insensitively and only at the start of a
    // line. Returns true and stores the value in *out when the header is found.
    static bool parse_content_length(const char* headers, size_t* out) {
        static const char name[] = "content-length:";
        const size_t name_len = sizeof(name) - 1;

        const char* line = headers;
        while (line != NULL && *line != '\0') {
            size_t i = 0;
            while (i < name_len) {
                char c = line[i];
                if (c >= 'A' && c <= 'Z') {
                    c = (char)(c - 'A' + 'a');
                }
                if (c != name[i]) {
                    break; // also stops on the terminating NUL
                }
                i++;
            }
            if (i == name_len) {
                // strtoul skips the optional whitespace after the colon
                *out = strtoul(line + name_len, NULL, 10);
                return true;
            }
            const char* next = strstr(line, "\r\n");
            line = next ? next + 2 : NULL;
        }
        return false;
    }

    // Read the HTTP response, print the header block and write the body to
    // output_file (binary mode).
    //   ssl         - connected SSL object (read from when use_ssl is true)
    //   sockfd      - connected TCP socket (read from when use_ssl is false)
    //   output_file - path of the file to create/overwrite
    //   use_ssl     - selects SSL_read (true) or recv (false)
    // Before returning, the connection is released: SSL_free(ssl) when use_ssl
    // is true, closesocket(sockfd) otherwise.
    // Header bytes are accumulated across reads until "\r\n\r\n" is seen; the
    // header block must fit in the 4 KiB buffer. When the server sends a
    // Content-Length it is used to judge completeness; otherwise a clean end of
    // stream after the headers counts as success.
    // NOTE(review): the body is written as received; a chunked transfer encoding
    // would not be decoded.
    void receive_http_response(SSL* ssl, SOCKET sockfd, const char* output_file, bool use_ssl) {
        FILE* file = fopen(output_file, "wb");
        if (!file) {
            printf("打开输出文件失败\n");
            if (use_ssl) {
                SSL_free(ssl);
            }
            else {
                closesocket(sockfd);
            }
            return;
        }

        char buffer[4096];
        size_t header_len = 0;            // header bytes currently held in buffer
        int bytes_received;
        bool header_passed = false;
        bool has_content_length = false;
        bool aborted = false;             // stopped for a reason other than EOF/read error
        size_t total_bytes_received = 0;
        size_t content_length = 0;

        for (;;) {
            // Until the header is complete, append behind what we already hold
            // and keep one byte free for the NUL terminator strstr needs.
            char* dest = header_passed ? buffer : buffer + header_len;
            int room = (int)(header_passed ? sizeof(buffer) : sizeof(buffer) - 1 - header_len);
            bytes_received = use_ssl ? SSL_read(ssl, dest, room) : recv(sockfd, dest, room, 0);
            if (bytes_received <= 0) {
                break;
            }

            const char* body = buffer;
            size_t body_len = (size_t)bytes_received;

            if (!header_passed) {
                header_len += (size_t)bytes_received;
                buffer[header_len] = '\0';

                char* header_end = strstr(buffer, "\r\n\r\n");
                if (!header_end) {
                    if (header_len >= sizeof(buffer) - 1) {
                        printf("HTTP响应头过长\n");
                        aborted = true;
                        break;
                    }
                    // Header still incomplete: keep what we have and read more
                    continue;
                }

                // Print the header block and pick up Content-Length from it
                *header_end = '\0';
                printf("HTTP响应头:\n%s\n", buffer);
                has_content_length = parse_content_length(buffer, &content_length);

                body = header_end + 4; // skip "\r\n\r\n"
                body_len = header_len - (size_t)(body - buffer);
                header_passed = true;
            }

            if (body_len > 0 && fwrite(body, 1, body_len, file) != body_len) {
                printf("写入输出文件失败\n");
                aborted = true;
                break;
            }
            total_bytes_received += body_len;
        }

        if (bytes_received < 0) {
            printf("接收HTTP响应时出错\n");
            if (use_ssl) {
                ERR_print_errors_fp(stderr);
            }
            else {
                printf("错误代码:%d\n", WSAGetLastError());
            }
        }

        printf("总共接收到字节数:%zu\n", total_bytes_received);

        // With Content-Length the byte count decides; without it, the server
        // closing the connection cleanly marks the end of the body.
        bool complete = !aborted && header_passed &&
            (has_content_length ? total_bytes_received == content_length : bytes_received == 0);

        if (complete) {
            printf("文件下载成功:%s\n", output_file);
        }
        else {
            printf("文件下载不完整:%s\n", output_file);
            if (has_content_length) {
                printf("预期字节数:%zu, 实际接收字节数:%zu\n", content_length, total_bytes_received);
            }
        }

        fclose(file);
        if (use_ssl) {
            SSL_free(ssl);
        }
        else {
            closesocket(sockfd);
        }
    }

  4. 编译运行crawlerc.exe。

  5. 查看文件下载成功。