11

Please note, I already know about the streaming nature of TCP connections; my question is not related to those kinds of things. It is rather about my suspicion that there is a bug in the Linux sockets implementation.

Update: Taking comments into account, I updated my code a little bit to check the return value of recv() not only to -1 but to any negative value. That was just in case. The results are the same.

I have a very simple TCP client/server application written in C. The full code of this project is available on github.

The client side runs multiple parallel threads, and each of threads does the following:

  1. Open socket
  2. Connect this socket to server
  3. Write 16 bytes of a predefined data pattern to the socket by pieces of random length
  4. Close socket
  5. Repeat steps 1 to 4 N times
/*
 * Send `len` bytes from `buff` over the connected socket `fd`.
 *
 * If `by_frags` is true, the data is written in chunks of random length
 * (1 .. remaining bytes) to exercise TCP segmentation; otherwise it is
 * written in as few send() calls as possible.
 *
 * Returns `len` on success or (size_t)-1 on a send() failure.
 *
 * Fixes vs. the original:
 *  - send() returns ssize_t; storing it in a size_t hid the sign and made
 *    the error check rely on wrap-around.
 *  - A short write on a blocking socket (e.g. after a signal) is not an
 *    error; we now advance by the number of bytes actually sent instead
 *    of failing the whole transfer.
 */
static size_t send_ex(int fd, const uint8_t *buff, size_t len, bool by_frags)
{
    size_t pos = 0;

    while ( pos < len )
    {
        size_t chunk_len = len - pos;
        ssize_t res;

        if ( by_frags ) {
            /* random fragment length in [1, len - pos] */
            chunk_len = (size_t) random() % chunk_len + 1;
        }

        res = send(fd, &buff[pos], chunk_len, 0);
        if ( res < 0 ) {
            return (size_t) -1;
        }

        pos += (size_t) res;    /* res may be < chunk_len: partial send */
    }

    return len;
}

/*
 * Client worker thread: `count` times, open a TCP socket, connect to
 * ctx->serveraddr, send the 16-byte test pattern (fragmented when the
 * global `frags` flag is set), close the socket and sleep `delay` ms.
 *
 * arg is a connection_ctx_t*; ctx->sent_packs counts successful sends.
 * Always returns NULL.
 */
static void *connection_task(void *arg) 
{   
    connection_ctx_t *ctx = (connection_ctx_t *) arg;
    uint32_t buff[4] = {0xAA55AA55, 0x12345678, 0x12345678, 0x12345678};
    size_t sent;
    int res, fd, i;

    for ( i = 0; i < count; i++ )
    {
        fd = socket(AF_INET, SOCK_STREAM, 0);
        if ( fd < 0 ) {
            fprintf(stderr, "Can't create socket!\n");
            break;
        }

        res = connect(fd, (struct sockaddr *) ctx->serveraddr, sizeof(struct sockaddr_in));
        if ( res < 0 ) {
            fprintf(stderr, "Connect failed!\n");                    
            close(fd);
            break;
        }

        /* Keep send_ex()'s size_t result in a size_t: the original stored
         * it in an int, truncating (size_t)-1 on 64-bit targets.  Also
         * cast the buffer to const uint8_t* to match the prototype. */
        sent = send_ex(fd, (const uint8_t *) buff, sizeof(buff), frags);
        if ( sent != sizeof(buff) ) {
            fprintf(stderr, "Send failed!\n");
            close(fd);
            break;
        }

        ctx->sent_packs++;

        res = close(fd);
        if ( res < 0 ) {
            fprintf(stderr, "CLI: Close Failed!!\n");
        }

        msleep(delay);
    }

    return NULL;
}

The server side runs a thread on each incoming connection, that does the following:

  1. Read data from the connected socket until it has read all 16 bytes
  2. After reading at least the first 4 bytes, it is checked that these bytes are equal to a predefined pattern.
/* Per-connection context handed to client_task().  Heap-allocated by the
 * accept loop; client_task() closes `fd` and frees the context on exit. */
typedef struct client_ctx_s {
    struct sockaddr_in addr;    /* peer address, used only for log messages */
    int fd;                     /* connected socket */
} client_ctx_t;

/*
 * Server worker thread: read exactly sizeof(buff) == 16 bytes from the
 * connected socket, checking — as soon as the first 4 bytes arrive —
 * that the stream starts with the 0xAA55AA55 marker.  Logs corruption,
 * short reads and recv() failures, then closes the socket and frees the
 * heap-allocated context.  Always returns NULL.
 */
void *client_task(void *arg) 
{
    client_ctx_t *client = (client_ctx_t *) arg;
    size_t free_space, pos;
    ssize_t chunk_len;
    uint32_t buff[4] = {0};
    int res;

    pos = 0;
    while ( pos != sizeof(buff) )
    {
        free_space = sizeof(buff) - pos;
        assert(pos < sizeof(buff));

        chunk_len = recv(client->fd, &((uint8_t *) buff)[pos], free_space, 0);
        if ( chunk_len <= 0 ) {
            if ( chunk_len < 0 ) {
                fprintf(stderr, "%s:%u: ERROR: recv failed (errno = %d; pos = %zu)!\n",
                        inet_ntoa(client->addr.sin_addr), 
                        ntohs(client->addr.sin_port),
                        errno, pos);
            }
            else if ( pos ) {
                /* Peer closed mid-message.  (pos < sizeof(buff) is implied
                 * by the loop condition; the original also tested it.) */
                fprintf(stderr, "%s:%u: ERROR: incomplete data block (pos = %zu)!\n",
                        inet_ntoa(client->addr.sin_addr),
                        ntohs(client->addr.sin_port),
                        pos);
            }
            goto out;
        }

        /* chunk_len > 0 here; cast avoids a signed/unsigned comparison. */
        assert((size_t) chunk_len <= free_space);
        pos += chunk_len;

        /* Check the marker word as soon as it is complete, so corruption
         * is reported even if the rest of the block never arrives. */
        if ( pos >= 4 && buff[0] != 0xAA55AA55) {
            fprintf(stderr, "%s:%u: ERROR: data corrupted (%08x)!\n", 
                    inet_ntoa(client->addr.sin_addr), 
                    ntohs(client->addr.sin_port),
                    buff[0]);
        }
    }

    fprintf(stdout, "%s:%u: %08x %08x %08x %08x\n",
            inet_ntoa(client->addr.sin_addr),
            ntohs(client->addr.sin_port),
            buff[0], buff[1], buff[2], buff[3]);

out:
    debug("Connection closed\n");
    res = close(client->fd);
    if ( res != 0 ) {
        /* Was assert(res == 0): that check is compiled out under NDEBUG,
         * leaving close() entirely unchecked.  Report instead. */
        fprintf(stderr, "SRV: close failed (errno = %d)!\n", errno);
    }
    free(client);
    return NULL;
}

Issues that came up when a client runs one thousand of sending threads, and each of them repeats connect-send-disconnect one hundred times (./client -t 1000 -c 100 -d 0 -f):

  1. Loss of the first bytes of the pattern that was sent.
  2. The total size of the data read from the socket is correspondingly less than 16 bytes.

image1

This behavior is repeatable both on local host and over a real network connection.

Examining the TCP flow of the corrupted data with Wireshark shows that:

  1. There is no issue on client side.
  2. The corrupted data corresponds to data that was carried in retransmitted segments.

image2

I can't really believe this problem lies in the Linux TCP/IP implementation. Can anybody explain what is wrong with my code?

4
  • Welcome to Stack Overflow! Please post code, data, and results as text, not screenshots (how to format code in posts). Why should I not upload images of code/data/errors? idownvotedbecau.se/imageofcode
    – Barmar
    Commented Jul 30 at 16:20
  • 2
    You're not checking for an error from recv(). It will return a negative chunk_len. The correct type is ssize_t, not size_t, so it can hold negative numbers.
    – Barmar
    Commented Jul 30 at 16:28
  • Is the socket in non-blocking mode on the server? Then recv() can return an EWOULDBLOCK error.
    – Barmar
    Commented Jul 30 at 16:29
  • 1
    @Barmar recv() can return only -1 on error and i check it. EWOULDBLOCK it is possible value of errno variable, not the return value of function
    – legden
    Commented Jul 30 at 16:45

4 Answers 4

5
+50

at first glance there is a similar problem here: https://wpbolt.com/syn-cookies-ate-my-dog-breaking-tcp-on-linux/

but in our case, Wireshark shows an ACK for every data packet. It still looks like a kernel bug.

To reproduce this error, it is not necessary to open a large number of TCP connections. 10 is enough.
This can be schematically reproduced as follows:

run server

...
listenfd = socket(...  
res = bind(listenfd, ...  
res = listen(listenfd, 1); !!! backlog set 1  
wait user key press (need wait add socket to backlog queue)

start client
run 10 thread with:

fd = socket(... 
z = setsockopt(fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); 
connect(fd ...  
for(int i=0;i<28;i++)  
    send(fd, &buff[i], 1, 0);  
recv()  

9 TCP streams enter in the backlog queue on the server side and begin re-sending SYN with increasing intervals.

at server side press enter, for unblock and

while(1)
  select([listenfd, socketN])
  listenfd: new connection
     accept(...) 
     add to socketN
  socketN: new data
     recv()

As a result, the first bytes of data in several TCP connections will be lost. This behavior is observed on the Ubuntu 24.04 with kernel 6.10.2.

1
  • Wow. Thanks for the link. For me it looks like exactly the same problem. So with linux SYN cookies implementaion you really can read corrupted data from TCP connection. I wonder how it is possible that this problem has not been solved for many years. For me it's a complete disaster that linux network stack implementation, commonly considered bulletproof, has such kind of a bug.
    – legden
    Commented Aug 2 at 15:49
3

In short: I think SYN-cookies are the root of problem

I don't know if the resulting behavior ("broken" first recv call after accept) is a kernel bug. As far as I understand, SYN-cookies feature allows the client to be "accepted" without him noticing anything. Your client seems like network attacker =).

Possibly useful links:


I found it rather strange that only the head of the sequence disappears... Next I checked dmesg and saw this:

TCP: request_sock_TCP: Possible SYN flooding on port 127.0.0.1:5050. Sending cookies.

Next I disable it (as far as I know, this is not recommended in production):

sudo sysctl net.ipv4.tcp_syncookies=0

After this, data corruption disappeared.


The following change in server.c:114 also fixes it:

--- a/server.c
+++ b/server.c
-    res = listen(listenfd, 5);
+    res = listen(listenfd, 8192);

https://man7.org/linux/man-pages/man2/listen.2.html


Additionally, updating the client logic with using MSG_MORE feature (linux-only, https://man7.org/linux/man-pages/man2/sendto.2.html) works around the problem because it reduces the load, allowing the server to "catch" begin of data. Which leads to strange thoughts that there is a bug in this operating system protection mechanism. I don't understand why in case of server overload the default behavior is not "reject connection requests until not ready".

5
  • SYN cookies are supposed to be transparent to the application. Is there a bug in the implementation that causes segments to be lost like this?
    – Barmar
    Commented Aug 1 at 18:27
  • As far as I understand, they should be "transparent" for the client. In this case, the server logic breaks down, because it cannot accept the head of the first client message via recv syscall.
    – p5-vbnekit
    Commented Aug 1 at 18:38
  • Another workaround would be to add a delay in the client, so it doesn't look like a syn flood attack?
    – Barmar
    Commented Aug 1 at 18:44
  • yep, he spawn 1k threads and does "connect, 16B msg, disconnect" 100 times for each with zero delay: ./client -t 1000 -c 100 -d 0 -f.
    – p5-vbnekit
    Commented Aug 1 at 18:53
  • 1
    Yes, after disabling tcp_syncookies problem disapeared. For now it's look really as bug in operating system.
    – legden
    Commented Aug 2 at 13:27
2

Reproduced with small count of tcp connections on debian trixie with python3.

#!/usr/bin/env python3

import sys
import time
import socket
import threading
import traceback
import concurrent.futures

assert "__main__" == __name__  # reproduction script: run directly, do not import

# Tunables for the reproduction.
_delay = +5.0e+0  # seconds the server sleeps before accepting (instead of pressing a key)
_streams = 16  # number of concurrent client processes; or `10` like in @Drepin7 answer
_host, _port = "localhost", 5050

# 388-byte payload every client sends: a 4-byte marker ("55aa" twice)
# followed by a 12-byte pattern repeated 32 times.
_expected = bytes().join((
    bytes.fromhex("55aa") * 2,
    bytes.fromhex("785634127856341278563412") * 32
))


def _request(key):
    # Client worker: connect to the server and stream the whole expected
    # payload two bytes at a time; tiny writes maximize segmentation.
    # Returns the key (tagged with the local port) even on failure — see
    # the note on `finally` below.
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as _sock:
            # `TCP_NODELAY` not required for reproduce
            # _sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
            _sock.connect((_host, _port))
            local_port = _sock.getsockname()[1]  # bound port
            key = f"{key}#{local_port}"
            print(f"{key} connected", file = sys.stderr, flush = True)
            for offset in range(0, len(_expected), 2):
                _sock.sendall(_expected[offset:offset + 2])

    except BaseException:
        print(
            traceback.format_exc().strip(),
            file = sys.stderr, flush = True
        )
        raise

    finally:
        # `return` in `finally` overrides the re-raise above, so the
        # caller always receives the key.
        return key


def _server():
    # Accept loop with a deliberately tiny backlog (1) plus a startup
    # delay, so concurrent clients pile up in the SYN queue and trigger
    # the kernel's SYN-cookie path.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as _socket:
        _socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        _socket.bind((_host, _port))
        _socket.listen(1)  # backlog = 1
        time.sleep(_delay)  # let clients connect before the first accept()

        print(
            f"server started after delay = {_delay}",
            file = sys.stderr, flush = True
        )

        while True:
            _data = bytes()

            _connection, _address = _socket.accept()
            _address = _address[1]  # remote port
            print(f"{_address} accepted", file = sys.stderr, flush = True)

            try:
                with _connection:
                    # Read until the full expected payload (or EOF) arrives.
                    while len(_expected) > len(_data):
                        _chunk = _connection.recv(len(_expected) - len(_data))
                        if not _chunk: break
                        _data += _chunk
                if _expected == _data: return
                # "head case": the received bytes are a proper tail of the
                # payload, i.e. the head of the stream was lost.
                if _expected.endswith(_data): _case = "head case"
                else: _case = "other case"
                print(
                    f"{_case} [{_address}]: {_data.hex()}",
                    file = sys.stdout, flush = True
                )

            except BaseException:
                print(
                    traceback.format_exc().strip(),
                    file = sys.stderr, flush = True
                )
                raise

            # NOTE(review): `continue` inside `finally` swallows both the
            # `return` above and the re-raised exception, so this loop
            # never exits — confirm this is intentional.
            finally: continue


# Start the server in a daemon thread so the process exits together with
# the client pool below.
threading.Thread(target = _server, daemon = True).start()

# Run each client in its own process to hammer the server concurrently.
with concurrent.futures.ProcessPoolExecutor(max_workers = _streams) as _pool:
    for _client in _pool.map(_request, range(_streams)): print(
        f"request finished: {_client}", flush = True, file = sys.stderr
    )

Same issue with losing the head of the request on the server side, and

TCP: request_sock_TCP: Possible SYN flooding on port 127.0.0.1:5050. Sending cookies.

Of course, after sudo sysctl net.ipv4.tcp_syncookies=0 it disappeared.

At this point it looks more and more like a kernel bug.

Kernel version: 6.9.10, standard config (original amd64 binaries from debian repo).

1

I have the same behavior (if the client runs with the -f [--fragments] key) with a python3 server implementation and the original client in C. And only the beginning of the sequence (the 1st chunk?) is always lost.

#!/usr/bin/env python3

import threading
import socketserver

assert "__main__" == __name__  # run as a script only

_mutex = threading.Lock()  # serializes print() across handler threads
# Expected 16-byte payload as a hex string: 4-byte marker + 12-byte pattern.
_expected = "55aa55aa785634127856341278563412"

class _Handler(socketserver.BaseRequestHandler):
    """Per-connection handler: drain the stream, then classify the result."""

    def handle(self):
        # Collect everything the client sends until it closes the socket.
        chunks = []
        while True:
            received = self.request.recv(1024)
            if not received:
                break
            chunks.append(received)
        payload = bytes().join(chunks).hex()
        if _expected == payload:
            return
        # "head case": only the tail of the payload arrived.
        label = "head case" if _expected.endswith(payload) else "other case"
        with _mutex:
            print(f"{label}: {payload}", flush = True)

# Threaded TCP server: one thread per accepted connection.
class _Server(socketserver.ThreadingMixIn, socketserver.TCPServer): pass

# `allow_reuse_address` must be set on the CLASS before the server is
# constructed: TCPServer.__init__ binds the socket and consults the flag
# at that point, so assigning it on the instance afterwards (as the
# original code did) has no effect.
_Server.allow_reuse_address = True
with _Server(("localhost", 5050), _Handler) as _server:
    _server.serve_forever()

Not the answer you're looking for? Browse other questions tagged or ask your own question.