Please note, I already know about the streaming nature of TCP connections; my question is not related to those kinds of things. It is rather about my suspicion that there is a bug in the Linux sockets implementation.
Update: Taking the comments into account, I updated my code a little to check the return value of recv()
against any negative value, not only -1. That was just in case. The results are the same.
I have a very simple TCP client/server application written in C. The full code of this project is available on github.
The client side runs multiple parallel threads, and each thread does the following:
- Open socket
- Connect this socket to server
- Write 16 bytes of a predefined data pattern to the socket in pieces of random length
- Close socket
- Repeat steps 1 to 4 N times
/*
 * Send the whole of buff[0..len) over the socket fd.
 *
 * When by_frags is true the data is handed to send() in chunks of random
 * length (1..remaining bytes each); otherwise each send() call is offered
 * everything that is still unsent.
 *
 * send() is allowed to accept fewer bytes than requested, so the loop
 * advances by the number of bytes actually accepted and keeps going until
 * the buffer is drained.
 *
 * Returns len on success, or (size_t) -1 if send() fails or makes no
 * progress.  A zero-length request returns 0 without calling send().
 */
static size_t send_ex(int fd, const uint8_t *buff, size_t len, bool by_frags)
{
    size_t pos = 0;

    while ( pos < len )
    {
        size_t remaining = len - pos;
        size_t chunk_len = remaining;
        ssize_t res;

        if ( by_frags ) {
            /* Random chunk length in the range [1, remaining]. */
            chunk_len = (size_t) random() % remaining + 1;
        }

        /* Keep the result signed so that the -1 error return is visible. */
        res = send(fd, &buff[pos], chunk_len, 0);
        if ( res <= 0 ) {
            /* Error, or no progress on a non-empty request. */
            return (size_t) -1;
        }

        /* A short count is not an error: continue after the accepted bytes. */
        pos += (size_t) res;
    }
    return len;
}
static void *connection_task(void *arg)
{
connection_ctx_t *ctx = (connection_ctx_t *) arg;
uint32_t buff[4] = {0xAA55AA55, 0x12345678, 0x12345678, 0x12345678};
int res, fd, i;
for ( i = 0; i < count; i++ )
{
fd = socket(AF_INET, SOCK_STREAM, 0);
if ( fd < 0 ) {
fprintf(stderr, "Can't create socket!\n");
break;
}
res = connect(fd, (struct sockaddr *) ctx->serveraddr, sizeof(struct sockaddr_in));
if ( res < 0 ) {
fprintf(stderr, "Connect failed!\n");
close(fd);
break;
}
res = send_ex(fd, (const char *) buff, sizeof(buff), frags);
if ( res != sizeof(buff) ) {
fprintf(stderr, "Send failed!\n");
close(fd);
break;
}
ctx->sent_packs++;
res = close(fd);
if ( res < 0 ) {
fprintf(stderr, "CLI: Close Failed!!\n");
}
msleep(delay);
}
return NULL;
}
The server side runs a thread on each incoming connection, that does the following:
- Read data from the connected socket until it has read all 16 bytes
- After reading at least the first 4 bytes, it is checked that these bytes are equal to a predefined pattern.
/*
 * State of one accepted connection.  client_task() receives a pointer to
 * such an object as its thread argument and frees it before returning.
 */
struct client_ctx_s {
    struct sockaddr_in addr;    /* peer address; used to label log output */
    int fd;                     /* connected socket descriptor            */
};

typedef struct client_ctx_s client_ctx_t;
/*
 * Server worker thread for one accepted connection.
 *
 * arg points to a heap-allocated client_ctx_t; this function closes
 * client->fd and frees the context before returning.
 *
 * It reads from the socket until a full 16-byte block has been collected.
 * As soon as the first 32-bit word is complete it is compared with the
 * expected marker 0xAA55AA55 and a mismatch is reported once on stderr
 * (reading continues regardless).  A complete block is printed to stdout.
 *
 * A recv() error, or the peer closing the connection in the middle of a
 * block, is reported on stderr and ends the task.  A connection that is
 * closed before any byte arrived ends the task silently.
 *
 * Always returns NULL.
 */
void *client_task(void *arg)
{
    client_ctx_t *client = (client_ctx_t *) arg;
    char peer[INET_ADDRSTRLEN];
    unsigned port = ntohs(client->addr.sin_port);
    size_t free_space, pos;
    ssize_t chunk_len;
    uint32_t buff[4] = {0};
    int first_word_checked = 0;
    int res;

    /*
     * Format the peer address once into a private buffer.  inet_ntoa()
     * returns a pointer to static storage, which is not a safe thing to
     * rely on when one thread is running per connection.
     */
    if ( inet_ntop(AF_INET, &client->addr.sin_addr, peer, sizeof(peer)) == NULL ) {
        peer[0] = '?';
        peer[1] = '\0';
    }

    pos = 0;
    while ( pos != sizeof(buff) )
    {
        free_space = sizeof(buff) - pos;
        chunk_len = recv(client->fd, (uint8_t *) buff + pos, free_space, 0);

        if ( chunk_len < 0 ) {
            /* Capture errno before any other library call can disturb it. */
            int err = errno;

            if ( err == EINTR ) {
                /* Interrupted by a signal before any data arrived: retry. */
                continue;
            }
            fprintf(stderr, "%s:%u: ERROR: recv failed (errno = %d; pos = %zu)!\n",
                    peer, port, err, pos);
            goto out;
        }

        if ( chunk_len == 0 ) {
            /* Peer closed the connection; complain only about a partial block. */
            if ( pos != 0 ) {
                fprintf(stderr, "%s:%u: ERROR: incomplete data block (pos = %zu)!\n",
                        peer, port, pos);
            }
            goto out;
        }

        assert((size_t) chunk_len <= free_space);
        pos += (size_t) chunk_len;

        /* Validate the marker once, as soon as the first word is complete. */
        if ( !first_word_checked && pos >= sizeof(buff[0]) ) {
            first_word_checked = 1;
            if ( buff[0] != 0xAA55AA55 ) {
                fprintf(stderr, "%s:%u: ERROR: data corrupted (%08x)!\n",
                        peer, port, buff[0]);
            }
        }
    }

    fprintf(stdout, "%s:%u: %08x %08x %08x %08x\n",
            peer, port,
            buff[0], buff[1], buff[2], buff[3]);

out:
    debug("Connection closed\n");
    res = close(client->fd);
    assert(res == 0);
    free(client);
    return NULL;
}
Issues came up when the client runs one thousand sending threads, each of which repeats connect-send-disconnect one hundred times (./client -t 1000 -c 100 -d 0 -f):
- Loss of the first bytes of the pattern that was sent.
- The total size of the data read from the socket is correspondingly less than 16 bytes.
This behavior is repeatable both on local host and over a real network connection.
Examining the TCP flow of the corrupted data with Wireshark shows that:
- There is no issue on client side.
- The corrupted data corresponds to data that was carried in retransmitted segments.
I can't really believe this problem lies in the Linux TCP/IP implementation. Can anybody explain what is wrong with my code?
recv(). It will return a negative chunk_len. The correct type is ssize_t, not size_t, so it can hold negative numbers. recv() can return an EWOULDBLOCK error.