Free Downloads, Community Forum,
FAQs and Developer Resources


Make /Tools Your Home | Link to us

Today's posts | Posts since last visit | Most Active Topics

All Forums Register Login Search Subscriptions My Profile Inbox
Tool Warehouse FAQs Resources Help Member List Address Book Logout

clamd STREAM segfault

 
Logged in as: Guest
Users viewing this topic: none
  Printable Version
All Forums >> [SFU / Interix / SUA Technology] >> Interix Advanced Forum >> clamd STREAM segfault Page: [1] 2   next >   >>
Login
Message << Older Topic   Newer Topic >>
clamd STREAM segfault - Jul. 20, '05, 2:37:02 PM   
breiter

 

Posts: 346
Joined: Jun. 14, '04,
From: Washington, DC
Status: online
I am having a problem with clamd segfaulting shortly after scanning using the STREAM protocol over TCP/IP. Clamd can run indefinitely when pointed at directories or files instead of using STREAM.

The difference between the SCAN verb and STREAM verb is in calling scanstream() in clamd/scanner.c instead of scan().

Is there anything in scanstream that looks worrisome?

int scanstream(int odesc, unsigned long int *scanned, const struct cl_node *root, const struct cl_limits *limits, int options, const struct cfgstruct *copt)
{
        int ret, portscan = 1000, sockfd, port = 0, acceptd;
        int tmpd, bread, retval, timeout, btread, min_port, max_port;
        long int size = 0, maxsize = 0;
        short bound = 0, rnd_port_first = 1;
        const char *virname;
        char buff[FILEBUFF];
        struct sockaddr_in server;
        struct hostent *he;
        struct cfgstruct *cpt;
        char *tmpname;


    /* get min port */
    min_port = cfgopt(copt, "StreamMinPort")->numarg;
    if(min_port < 1024 || min_port > 65535)
        min_port = 1024;

    /* get max port */
    max_port = cfgopt(copt, "StreamMaxPort")->numarg;
    if(max_port < min_port || max_port > 65535)
        max_port = 65535;

    /* bind to a free port */
    while(!bound && --portscan) {
        if(rnd_port_first) {
            /* try a random port first */
            port = min_port + cli_rndnum(max_port - min_port + 1);
            rnd_port_first = 0;
        } else {
            /* try the neighbor ports */
            if(--port < min_port)
                port=max_port;
        }

        memset((char *) &server, 0, sizeof(server));
        server.sin_family = AF_INET;
        server.sin_port = htons(port);

        if((cpt = cfgopt(copt, "TCPAddr"))->enabled) {
            pthread_mutex_lock(&gh_mutex);
            if((he = gethostbyname(cpt->strarg)) == 0) {
                logg("!gethostbyname(%s) error: %s\n", cpt->strarg);
                mdprintf(odesc, "gethostbyname(%s) ERROR\n", cpt->strarg);
                pthread_mutex_unlock(&gh_mutex);
                return -1;
            }
            server.sin_addr = *(struct in_addr *) he->h_addr_list[0];
            pthread_mutex_unlock(&gh_mutex);
        } else
            server.sin_addr.s_addr = INADDR_ANY;

        if((sockfd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
            continue;

        if(bind(sockfd, (struct sockaddr *) &server, sizeof(struct sockaddr_in)) == -1)
            close(sockfd);
        else
            bound = 1;
    }

    timeout = cfgopt(copt, "ReadTimeout")->numarg;
    if(timeout == 0)
        timeout = -1;

    if(!bound && !portscan) {
        logg("!ScanStream: Can't find any free port.\n");
        mdprintf(odesc, "Can't find any free port. ERROR\n");
        close(sockfd);
        return -1;
    } else {
        listen(sockfd, 1);
        if(mdprintf(odesc, "PORT %d\n", port) <= 0) {
            logg("!ScanStream: error transmitting port.\n");
            close(sockfd);
            return -1;
        }
    }

    switch(retval = poll_fd(sockfd, timeout)) {
        case 0: /* timeout */
            mdprintf(odesc, "Accept timeout. ERROR\n");
            logg("!ScanStream %d: accept timeout.\n", port);
            close(sockfd);
            return -1;
        case -1:
            mdprintf(odesc, "Accept poll. ERROR\n");
            logg("!ScanStream %d: accept poll failed.\n", port);
            close(sockfd);
            return -1;
    }

    if((acceptd = accept(sockfd, NULL, NULL)) == -1) {
        close(sockfd);
        mdprintf(odesc, "accept() ERROR\n");
        logg("!ScanStream %d: accept() failed.\n", port);
        return -1;
    }

    logg("*Accepted connection on port %d, fd %d\n", port, acceptd);

    if ((tmpname = cli_gentempdesc(NULL, &tmpd)) == NULL) {
        shutdown(sockfd, 2);
        close(sockfd);
        close(acceptd);
        mdprintf(odesc, "tempfile() failed. ERROR\n");
        logg("!ScanStream %d: Can't create temporary file.\n", port);
        return -1;
    }

    maxsize = cfgopt(copt, "StreamMaxLength")->numarg;

    btread = sizeof(buff);

    while((retval = poll_fd(acceptd, timeout)) == 1) {
        bread = read(acceptd, buff, btread);
        if(bread <= 0)
            break;
        size += bread;

        if(writen(tmpd, buff, bread) != bread) {
            shutdown(sockfd, 2);
            close(sockfd);
            close(acceptd);
            mdprintf(odesc, "Temporary file -> write ERROR\n");
            logg("!ScanStream %d: Can't write to temporary file.\n", port);
            close(tmpd);
            if(!cfgopt(copt, "LeaveTemporaryFiles")->enabled)
                unlink(tmpname);
            free(tmpname);
            return -1;
        }

        if(maxsize && (size + btread >= maxsize)) {
            btread = (maxsize - size); /* only read up to max */

            if(btread <= 0) {
                logg("^ScanStream %d: Size limit reached (max: %d)\n", port, maxsize);
                break; /* Scan what we have */
            }
        }
    }

    switch(retval) {
        case 0: /* timeout */
            mdprintf(odesc, "read timeout ERROR\n");
            logg("!ScanStream %d: read timeout.\n", port);
            break;
        case -1:
            mdprintf(odesc, "read poll ERROR\n");
            logg("!ScanStream %d: read poll failed.\n", port);
            break;
    }

    if(retval == 1) {
        lseek(tmpd, 0, SEEK_SET);
        ret = cl_scandesc(tmpd, &virname, scanned, root, limits, options);
    } else {
        ret = -1;
    }
    close(tmpd);
    if(!cfgopt(copt, "LeaveTemporaryFiles")->enabled)
        unlink(tmpname);
    free(tmpname);

    close(acceptd);
    close(sockfd);

    if(ret == CL_VIRUS) {
        mdprintf(odesc, "stream: %s FOUND\n", virname);
        logg("stream %d: %s FOUND\n", port, virname);
        virusaction("stream", virname, copt);
    } else if(ret != CL_CLEAN) {
        if(retval == 1) {
            mdprintf(odesc, "stream: %s ERROR\n", cl_strerror(ret));
            logg("stream %d: %s ERROR\n", port, cl_strerror(ret));
        }
    } else {
        mdprintf(odesc, "stream: OK\n");
        if(logok)
            logg("stream %d: OK\n", port); 
    }

    return ret;
}


Personally, I'm suspicious of poll_fd which is defined in clamd/others.c. Normally, poll_fd calls poll_fds which relies upon poll(3). I already added an #ifdef to prevent using poll(3), so it is using an alternate implementation.
/* submitted by breiter@wolfereiter.com: do not use poll(2) on Interix */
/* With HAVE_POLL undefined, poll_fds() compiles its select()-based #else
 * branch instead of the poll() one.
 * NOTE(review): HAVE_POLL_H presumably guards the <poll.h> include; confirm. */
#ifdef C_INTERIX
#undef HAVE_POLL
#undef HAVE_POLL_H
#endif

I'm wondering if there is something else subtly wrong with poll_fds.
/*
 * Wait until one of the descriptors in fds[0..nfds-1] becomes readable.
 *
 * fds         - array of descriptors to watch
 * nfds        - number of entries in fds
 * timeout_sec - timeout in seconds when > 0.
 *               NOTE(review): the two implementations disagree for values
 *               <= 0. The poll() branch hands them to poll() unchanged
 *               (0 returns immediately, negative blocks), while the select()
 *               branch blocks indefinitely for both. Callers should not
 *               rely on timeout_sec == 0.
 *
 * Returns 0 on timeout and -1 on error (including allocation failure and
 * descriptors that cannot be placed in an fd_set). When nfds > 1 and a
 * descriptor is ready, returns the 1-based index of the first ready one;
 * otherwise returns the raw poll()/select() result.
 */
int poll_fds(int *fds, int nfds, int timeout_sec)
{
        int retval;
        int i;
#ifdef HAVE_POLL
        struct pollfd poll_1[1];
        struct pollfd *poll_data = poll_1;

    /* the single-descriptor case uses the stack slot; only larger sets
     * need heap storage */
    if (nfds>1) {
        poll_data = malloc(nfds*sizeof(*poll_data));
        if (poll_data == NULL)
            return -1;
    }
    for (i=0; i<nfds; i++) {
        poll_data[i].fd = fds[i];
        poll_data[i].events = POLLIN;
        poll_data[i].revents = 0;
    }

    /* poll() takes milliseconds */
    if (timeout_sec > 0) {
        timeout_sec *= 1000;
    }
    while (1) {
        retval = poll(poll_data, nfds, timeout_sec);
        if (retval == -1) {
            if (errno == EINTR) {
                continue;
            }
            if (nfds>1)
                free(poll_data);
            return -1;
        }
        if (nfds>1) {
            if (retval>0) {
                /* translate "n descriptors ready" into the 1-based index
                 * of the first ready descriptor */
                for (i=0; i<nfds; i++) {
                    if (poll_data[i].revents) {
                        retval = i+1;
                        break;
                    }
                }
            }
            free(poll_data);
        }
        return retval;
    }

#else
        fd_set rfds;
        struct timeval tv;
        int maxfd = 0;

    for (i=0; i<nfds; i++) {
        /* FD_SET() is undefined for descriptors outside [0, set size) */
        if (fds[i] < 0 || fds[i] >= DEFAULT_FD_SETSIZE) {
            return -1;
        }
        if (fds[i] > maxfd)
            maxfd = fds[i];
    }

    while (1) {
        /* select() modifies the set and may modify tv, so both are rebuilt
         * on every (re)try */
        FD_ZERO(&rfds);
        for (i=0; i<nfds; i++)
            FD_SET(fds[i], &rfds);
        tv.tv_sec = timeout_sec;
        tv.tv_usec = 0;

        retval = select(maxfd+1, &rfds, NULL, NULL,
                        (timeout_sec>0 ? &tv : NULL));
        if (retval == -1) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        if ((nfds>1) && (retval>0)) {
            /* translate "n descriptors ready" into the 1-based index of
             * the first ready descriptor */
            for (i=0; i<nfds; i++) {
                if (FD_ISSET(fds[i],&rfds)) {
                    retval = i+1;
                    break;
                }
            }
        }
        return retval;
    }
#endif

    /* not reached: both loops above only leave through return */
    return -1;
}

/*
 * Single-descriptor convenience wrapper around poll_fds(): waits on fd
 * with the given timeout and returns whatever poll_fds() returns.
 */
int poll_fd(int fd, int timeout_sec)
{
    int single[1];

    single[0] = fd;
    return poll_fds(single, 1, timeout_sec);
}


I need a little of ye olde Ruddock magik.
Post #: 1
RE: clamd STREAM segfault - Jul. 20, '05, 10:00:37 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
> I am having a problem with clamd segfaulting

My immediate thought was "gee, this looks like a question for Brian"

> Personally, I'm suspicious of poll_fd which is defined in clamd/others.c

Yes, your suspicion is correct. The API poll() is only good for the /proc filesystem.
For any other filesystem poll() will just not work.

I looked at the code above and didn't see anything at first read.
Then I thought it might be the AF_UNIX select() problem, but the code is AF_INET.
So that's not the case.
You should have a core file around that you can get gdb going with. Then
at least you can see where the crash happened ("gdb clamd core" in the directory
the core file is in which should be where clamd was started from).

The only trouble I could see is if you have 4096 open file descriptors. But it's
unlikely you're getting that many because select() would have returned an error
at 1025 unless you have made the registry change for maximum open files.
Now I see poll_fd is only calling 1 fd. So very unlikely this is.

Try the gdb thing is my best suggestion at this time.

(in reply to breiter)
Post #: 2
RE: clamd STREAM segfault - Jul. 21, '05, 9:06:35 AM   
breiter

 

Posts: 346
Joined: Jun. 14, '04,
From: Washington, DC
Status: online
I'm no expert with gdb, but it seems like the problem is not with clam itself but is hesiod.c. Is that part of BIND?

When I send my test scans they complete OK and return results. About a second or two later, after an obvious pause, the segv happens.

Anyway. My new suspicion is that BIND9 is causing the segv.

For the record I am using "bind-9.2.3.6-bin Version 9.2.3.6 of bind for Interix 3.5."

(gdb) run
warning: "_environ": indirect symbol does not have real one (-1)

warning: "_RaiseException@16": indirect symbol does not have real one (-1)

warning: "_beThreadSafe": indirect symbol does not have real one (-1)

warning: "_environ": indirect symbol does not have real one (-1)

warning: "_RaiseException@16": indirect symbol does not have real one (-1)

warning: "_environ": indirect symbol does not have real one (-1)

warning: "_RaiseException@16": indirect symbol does not have real one (-1)

warning: "_environ": indirect symbol does not have real one (-1)

warning: "_RaiseException@16": indirect symbol does not have real one (-1)

warning: "_beThreadSafe": indirect symbol does not have real one (-1)

LibClamAV debug: Loading databases from /usr/local/share/clamav
LibClamAV debug: Loading /usr/local/share/clamav/daily.cvd
LibClamAV debug: in cli_cvdload()
LibClamAV debug: MD5(.tar.gz) = df80bc7c18d325ed710c110d4a5a045b
LibClamAV debug: Decoded signature: df80bc7c18d325ed710c110d4a5a045b
LibClamAV debug: Digital signature is correct.
LibClamAV debug: in cli_untgz()
LibClamAV debug: Unpacking /tmp/clamav-61eb2fd911479405f25544c053c71aeb/COPYING
LibClamAV debug: Unpacking /tmp/clamav-61eb2fd911479405f25544c053c71aeb/daily.db
LibClamAV debug: Unpacking /tmp/clamav-61eb2fd911479405f25544c053c71aeb/daily.hdb
LibClamAV debug: Unpacking /tmp/clamav-61eb2fd911479405f25544c053c71aeb/daily.ndb
LibClamAV debug: Loading databases from /tmp/clamav-61eb2fd911479405f25544c053c71aeb
LibClamAV debug: Loading /tmp/clamav-61eb2fd911479405f25544c053c71aeb/daily.db
LibClamAV debug: Initializing main node
LibClamAV debug: Initializing trie
LibClamAV debug: Initializing BM tables
LibClamAV debug: in cli_bm_init()
LibClamAV debug: BM: Number of indexes = 63744
LibClamAV debug: Loading /tmp/clamav-61eb2fd911479405f25544c053c71aeb/daily.hdb
LibClamAV debug: Initializing md5 list structure
LibClamAV debug: Loading /tmp/clamav-61eb2fd911479405f25544c053c71aeb/daily.ndb
LibClamAV debug: Loading /usr/local/share/clamav/main.cvd
LibClamAV debug: in cli_cvdload()
LibClamAV debug: MD5(.tar.gz) = 7c497735a7e1a3e15dde75832bef48f3
LibClamAV debug: Decoded signature: 7c497735a7e1a3e15dde75832bef48f3
LibClamAV debug: Digital signature is correct.
LibClamAV debug: in cli_untgz()
LibClamAV debug: Unpacking /tmp/clamav-e25b9d174875023fdca46b3e879720cb/COPYING
LibClamAV debug: Unpacking /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.db
LibClamAV debug: Unpacking /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.hdb
LibClamAV debug: Unpacking /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.ndb
LibClamAV debug: Unpacking /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.zmd
LibClamAV debug: Unpacking /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.fp
LibClamAV debug: Loading databases from /tmp/clamav-e25b9d174875023fdca46b3e879720cb
LibClamAV debug: Loading /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.db
LibClamAV debug: Loading /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.fp
LibClamAV debug: Loading /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.hdb
LibClamAV debug: Loading /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.ndb
LibClamAV debug: Loading /tmp/clamav-e25b9d174875023fdca46b3e879720cb/main.zmd
Bound to address 127.0.0.1 on tcp port 3310
Setting connection queue length to 15
Archive: Archived file size limit set to 10485760 bytes.
Archive: Recursion level limit set to 8.
Archive: Files limit set to 1000.
Archive: Compression ratio limit set to 250.
Archive support enabled.
Portable Executable support enabled.
Mail files support enabled.
OLE2 support enabled.
HTML support enabled.
Self checking every 600 seconds.
LibClamAV debug: Recognized RAR file
LibClamAV debug: in scanrar()
LibClamAV debug: in cli_unrar
LibClamAV debug: Head CRC: 90cf
LibClamAV debug: Head Type: 73
LibClamAV debug: Flags: 0000
LibClamAV debug: Head Size: 000d
LibClamAV debug: Found file block.
LibClamAV debug: Pack Size: 290
LibClamAV debug: UnPack Version: 0x14
LibClamAV debug: Pack Method: 0x33
LibClamAV debug: Filename: clam.exe
LibClamAV debug: Expected File CRC: 0xef073cfd
LibClamAV debug: Computed File CRC: 0xef073cfd
LibClamAV debug: Recognized DOS/W32 executable/library/driver file
LibClamAV debug: Calculated MD5 checksum: aa15bcf478d165efd2065190eb473bcb
LibClamAV debug: ClamAV-Test-File found in descriptor 10.
LibClamAV debug: RAR: Exit code: 1
stream 50163: ClamAV-Test-File FOUND
LibClamAV debug: Recognized RAR file
LibClamAV debug: in scanrar()
LibClamAV debug: in cli_unrar
LibClamAV debug: Head CRC: 90cf
LibClamAV debug: Head Type: 73
LibClamAV debug: Flags: 0000
LibClamAV debug: Head Size: 000d
LibClamAV debug: Found file block.
LibClamAV debug: Pack Size: 295
LibClamAV debug: UnPack Version: 0x1d
LibClamAV debug: Pack Method: 0x33
LibClamAV debug: Filename: clam.exe
LibClamAV debug: Offset: 62
LibClamAV debug: Not solid
LibClamAV debug: Read tables
LibClamAV debug: in read_tables Offset=364 in_addr=0 read_top=302
LibClamAV debug: init done
LibClamAV debug: Finished length: 544
LibClamAV debug: Expected File CRC: 0xef073cfd
LibClamAV debug: Computed File CRC: 0xef073cfd
LibClamAV debug: Recognized DOS/W32 executable/library/driver file
LibClamAV debug: Calculated MD5 checksum: aa15bcf478d165efd2065190eb473bcb
LibClamAV debug: ClamAV-Test-File found in descriptor 10.
LibClamAV debug: RAR: Exit code: 1
stream 50118: ClamAV-Test-File FOUND
LibClamAV debug: Recognized MS CAB file
LibClamAV debug: in cli_scanmscab()
LibClamAV debug: MSCAB: Extracting data to /tmp/clamav-4500b38e9541c0231810f5bbb0b2a217
LibClamAV debug: Recognized DOS/W32 executable/library/driver file
LibClamAV debug: Calculated MD5 checksum: aa15bcf478d165efd2065190eb473bcb
LibClamAV debug: ClamAV-Test-File found in descriptor 9.
stream 50167: ClamAV-Test-File FOUND
LibClamAV debug: Recognized DOS/W32 executable/library/driver file
LibClamAV debug: Calculated MD5 checksum: aa15bcf478d165efd2065190eb473bcb
LibClamAV debug: ClamAV-Test-File found in descriptor 8.
stream 50109: ClamAV-Test-File FOUND
LibClamAV debug: Recognized BZip file
LibClamAV debug: Recognized DOS/W32 executable/library/driver file
LibClamAV debug: Calculated MD5 checksum: aa15bcf478d165efd2065190eb473bcb
LibClamAV debug: ClamAV-Test-File found in descriptor 10.
LibClamAV debug: Bzip: Infected with ClamAV-Test-File
stream 50162: ClamAV-Test-File FOUND
LibClamAV debug: Recognized ZIP file
LibClamAV debug: in scanzip()
LibClamAV debug: Zip: clam.exe, crc32: 0xef073cfd, encrypted: 0, compressed: 256, normal: 544, method: 8, ratio: 2 (max: 250)
LibClamAV debug: Recognized DOS/W32 executable/library/driver file
LibClamAV debug: Calculated MD5 checksum: aa15bcf478d165efd2065190eb473bcb
LibClamAV debug: ClamAV-Test-File found in descriptor 10.
LibClamAV debug: Zip: Infected with ClamAV-Test-File
stream 50182: ClamAV-Test-File FOUND
LibClamAV debug: Calculated MD5 checksum: f472b4f5d912f58f1ce64e6f68105758
[New LWP 2]

Program received signal SIGSEGV, Segmentation fault.
[Switching to LWP 2]
0x0049720e in hesiod_end (context=0x113f4d8) at hesiod.c:174
in hesiod.c
(gdb) Quit
(gdb) frame 0
#0 0x0049720e in hesiod_end (context=0x113f4d8) at hesiod.c:174
(gdb) frame 1
#1 0x00489011 in dns_close (this=0x10f922c) at dns.c:149
in dns.c
(gdb) frame 2
#2 0x0047ba43 in gen_close (this=0x10f91a8) at gen.c:245
in gen.c
(gdb) frame 3
#3 0x00477462 in __net_data_destroy (p=0x10f80ec) at irs_data.c:110
in irs_data.c
(gdb) frame 4
#4 0x77ebb49f in ?? () at /tmp/cczLMK6t.s:33
in /tmp/cczLMK6t.s
Current language: auto; currently asm
(gdb) frame 5
#5 0x77ebb7dd in ?? () at /tmp/cczLMK6t.s:33

(in reply to Rodney)
Post #: 3
RE: clamd STREAM segfault - Jul. 21, '05, 11:37:22 AM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
> I'm no expert with gdb, but it seems like the problem is not with clam itself but is hesiod.c. Is that part of BIND?

Yes, the file hesiod.c is part of the BIND source, BUT...

... looking at the short output from gdb it may be more complex than that
because it looks like clamd is a threaded program (the "Switching to LWP 2"
is the hint). It being a threaded program can explain why things appear to complete,
pause and then crash. It might be a cleanup thread spun off (worker thread).
Anyway, line 174 in hesiod.c can't SEGV. What has happened is that some thread
has SEGV'd and then gdb switched over to another thread that is still going.
Or more specifically, it looks like the thread that created thread #2 has SEGV'd.

You have to get back to the thread that's crashed and get a stack trace from there.

So, the $0.05 gdb tour for this is once the SEGV happens get the gdb
prompt. For a stack trace enter the command "bt". To see which threads are there do
the "info threads". To switch to a thread do "thread X" where 'X' is the thread number.
The "list" command will list source code (if it can be found). "up" will move
you up the stack while "down" will move you down the stack (the stack is displayed
upside-down when you do "bt"). "print" can o/p a variable's content.

(in reply to breiter)
Post #: 4
RE: clamd STREAM segfault - Jul. 21, '05, 5:37:13 PM   
breiter

 

Posts: 346
Joined: Jun. 14, '04,
From: Washington, DC
Status: online
Yeah. It's threaded. When the clamd starts up (clamd/clamd.c), it creates a socket to listen on (clamd/tcpserver.c) and a loop (clamd/server-th.c) to poll the socket. It also creates a thread pool (clamd/thrmgr.c).

The acceptloop_th() function in clamd/server-th.c contains a for(;;) loop that polls for incoming commands on the main socket. Whenever a command comes in, it is queued to the thread pool. The thread pool has a stack of pointers to the commands and manages executing them.

The stream protocol works like this:

The main socket (tcp 3310) has acceptloop_th listening on it in a polling loop. It creates a thread pool with a pointer to the scanner_thread function. This function is called asynchronously by the thread pool as it dequeues its jobs.

thrmgr_new(max_threads, idletimeout, scanner_thread)

If a client connects, a new client_conn_t structure is created and passed to the thread pool with thrmgr_dispatch().

/* Per-connection job record: created when a client connects, handed to the
 * thread pool with thrmgr_dispatch(), and received by scanner_thread() as a
 * void pointer that is cast back to client_conn_t. */
typedef struct client_conn_tag {
/* first argument of command(); presumably the client's socket descriptor */
int sd;
/* forwarded to command() as its options argument */
int options;
/* configuration, forwarded to command() */
const struct cfgstruct *copt;
/* forwarded to command() as its root argument; presumably the loaded
 * signature database -- confirm */
struct cl_node *root;
/* NOTE(review): presumably the time root was loaded -- confirm */
time_t root_timestamp;
/* forwarded to command() as its limits argument */
const struct cl_limits *limits;
/* NOTE(review): presumably the listening sockets and their count -- confirm */
int *socketds;
int nsockets;
} client_conn_t;

The thread pool eventually passes this client_conn_t structure to scanner_thread() as a raw pointer to void which is cast back to client_conn_t as the variable conn. scanner_thread calls the command() in session.c. The arguments to command() are most of the pieces of client_conn_t plus a timeout.

command(conn->sd, conn->root, conn->limits, conn->options, conn->copt, timeout)

command() pulls the client's command from the socket and runs through a big if-else-if so that it can call function that the client requested. In this case that is scanstream() in scanner.c.

scanstream opens another port and sends the port number back to the client. The client then connects to the second port and sends a byte stream. scanstream writes the byte stream to a tmp file.

Assuming that nothing has gone wrong to this point (which it doesn't for me), the temp file is passed to cl_scandesc in libclamav. The temp file is then closed and deleted. Next the second socket is closed. Finally the results of the scan are written to the original socket on tcp 3310 and the scanstream() function returns.

The return code from scanstream() will not be an error so it is ignored by command(). Then command() returns 0.

scanner_thread() falls through a switch statement and an if without executing anything. Finally it closes the client connection and frees the memory the structure used. Then it returns void and the thread should end.

In fact, the thrmgr_worker does return NULL. It is just then that the segv happens in "hesiod_end()". So the thread segvs out instead of ending and I'm left with two threads and not one.

I'm having serious trouble figuring out where the segv is happening. gdb seems to completely lose track of the flow of execution after acceptloop_th(). All I get is stuff like "0x7c82ed54 in ?? () at cczLMK6t.s:33".

gdb is referencing a file /tmp/cczLM6t.s. It is a different temp file name every time, but it doesn't actually exist. What does this mean?

At this point I've stepped through execution a number of times. It appears to always segv on this line:

454 new_sd = accept(socketd, NULL, NULL);

It should just sit there forever waiting, right? But it croaks. I think this is because the other thread is not exiting cleanly. However, it seems that if any command other than scanstream() is executed by the thread pool, the thread pool exits and is cleaned up without crashing anything.

I have to say at this point I'm feeling pretty stumped.

(in reply to Rodney)
Post #: 5
RE: clamd STREAM segfault - Jul. 21, '05, 7:44:11 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
Welcome to the dark side of threaded programming
First, let me introduce you to a cat. I'm sorry I don't know the cat's name.
But his owner (or rather human companion) is name Schroeder.
Now Schroeder had this thing about putting his cat in a box. He claimed it
had something to do with quantum mechanics (he must've driven a weird car ).

Threaded programming has a lot in common with the opening of this box of Schroeder's.
The "box" in this case is the program. The "opener" of the box is the debugger (gdb).
The debugger very often will change the nature of what is happening, or when it happens.
So you can never rely on the accuracy of the information given. This is not a slight on
gdb, but just the nature of things (I've used several other debuggers on other
OS's with the same difficulties).

In a traditional program you can always see in a nice linear fashion what is happening.
But this is a threaded program. The quantum (time each thread gets to run and when) means
things can be happening without your seeing it. So where you see the SEGV is most often
just the after effect. For analogy, you see a car crash and notice one wheel has fallen
off. A conclusion would be the car had some failure with the wheel causing the crash.
Well, the wheel did fall off but it was caused by the bazooka shot by the guy hiding
behind the bushes (where you can't see him). In the hesiod.c code line 174 the action
is to assign a value to a function local variable. The only way for this code to fail
is if something wrote over the data segment of the program. Now you run the debugger
and 'zap' the program is somewhere else. Same problem though. Something has likely
written over a data segment. Depending on how the program gets loaded and run the
data segments may shift, etc. Running the debugger can help cause these shifts.

So rather than looking for a direct effect relationship there must be an indirect one.
After all, recall, all programming can be attributed to a series of side effects.
There are a few ways to approach this. One of the easiest to do is brute force code
elimination (with or without prejudice). You've narrowed down the offending function.
Start "#if 0"'ing blocks of it. Start with the whole dang function as one large "#if 0"
and else a return of 1. Then start exposing sections; you may need to "balance" some
of the blocks. I wouldn't worry about file descriptors leaking or such at this stage.
Looking at the function again I'm kinda suspicious of the call to gethostbyname()
because it's not thread safe. gethostbyname_r() should be used instead. The BIND
man page is "lwres_gethostbyname_r". It's in the BIND library as "gethostbyname_r".
The returned value from gethostbyname() is used outside the lock the code has
during the call. No matter what good intentions exist, outside that lock anyone
can do anything (assuming even that the locking protocol is being followed for
all of the source code; hoping they have a locking protocol!).

Hopefully this helps you move forward.

(in reply to breiter)
Post #: 6
RE: clamd STREAM segfault - Jul. 21, '05, 9:42:38 PM   
breiter

 

Posts: 346
Joined: Jun. 14, '04,
From: Washington, DC
Status: online
Bless Shroedinger and his cat and the box it was crammed in. Even though I think you should have used the Heisenberg Uncertainty Principle metaphor instead of Shroedinger's cat-- Rodney, you are the best.

You managed to hone in on the exact problem. When I #ifdef out the call to gethostbyname() in scanstream() everything works as it should. Threads are spun up and then go away perfectly!

After experiencing the joy of debugging pthreads in DDD, I'm ready to run screaming back to threads in Java and C#.

So all that is left is for me to implement gethostbyname_r() in the place of gethostbyname().

But there is no man page entry!

(in reply to Rodney)
Post #: 7
RE: clamd STREAM segfault - Jul. 22, '05, 12:14:46 AM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
> But there is no man page entry!

At the shell prompt do: "man lwres_gethostbyname_r"
Sorry about this, but I didn't have time to rename all of the man pages without
the "lwres_" prefix. In the BIND library it is just "gethostbyname_r".

> I'm ready to run screaming back to threads in Java and C#

thank the gods you did not say C++

Congrat's on getting it going.
That it hasn't bit someone on another platform already is interesting.

(in reply to breiter)
Post #: 8
RE: clamd STREAM segfault - Jul. 22, '05, 1:35:35 AM   
breiter

 

Posts: 346
Joined: Jun. 14, '04,
From: Washington, DC
Status: online
> Congrat's on getting it going.

Well, I wouldn't say that I have anything going. I just have the problem pretty well defined.

I tried inserting gethostbyname_r() and while I'm not ruling out the possibility that I am not calling gethostbyname_r properly, I can say that it doesn't fix anything using it. Unfortunately I know only enough to be dangerous with C socket programming.

> That it hasn't bit someone on another platform already is interesting.

I think I can explain that. By default, clamd binds to a local socket not a TCP socket. Also by default when you switch to TCP sockets, it allows connections from any IP address.


The original code that causes the segv to occur is in the scanstream function in clamd/scanner.c.
        /* Resolve the bind address only when TCPAddr is enabled in the
         * configuration; otherwise bind to INADDR_ANY. */
        if((cpt = cfgopt(copt, "TCPAddr"))->enabled) {
        /* gh_mutex is held across the gethostbyname() call and the copy of
         * its result */
        pthread_mutex_lock(&gh_mutex);
            if((he = gethostbyname(cpt->strarg)) == 0) {
                /* NOTE(review): the format has two %s conversions but only
                 * one argument is supplied */
                logg("!gethostbyname(%s) error: %s\n", cpt->strarg);
                mdprintf(odesc, "gethostbyname(%s) ERROR\n", cpt->strarg);
                pthread_mutex_unlock(&gh_mutex);
                return -1;
            }
            /* only the first address of the result is used */
            server.sin_addr = *(struct in_addr *) he->h_addr_list[0];
                pthread_mutex_unlock(&gh_mutex);
        } else
            server.sin_addr.s_addr = INADDR_ANY;

cfgopt() parses the configuration options. So if you don't have TCPAddr set in clamd.conf you accept connections from anywhere, but more importantly the evil code does not get called.

In order to exercise this block of code you have to disable LocalSocket, enable TCPSocket and enable TCPAddr in clamd.conf. That's three manual configurations away from the default settings.

PS.

My call to gethostbyname_r looks something like this:
        /* Attempted replacement of gethostbyname() with gethostbyname_r(). */
        int err = 0;    /* handed to gethostbyname_r() as &err */
        char c_buff[36];
        he = gethostbyname_r(cpt->strarg, (struct hostent *)mmalloc(sizeof(struct hostent)), c_buff, sizeof(c_buff), &err);

How wrong is that?

< Message edited by breiter -- Jul. 22, '05, 8:39:17 AM >

(in reply to Rodney)
Post #: 9
RE: clamd STREAM segfault - Jul. 22, '05, 2:49:34 AM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
> How wrong is that?

From the man page:
lwres_gethostbyname_r() is  a
thread-safe  function  for   forward
lookups.  If an error occurs, an error code is returned in
*error.  resbuf is a pointer to a struct hostent which  is
initialised  by  a  successful  call  to  lwres_gethostby-
name_r() .  buf is a buffer of length len bytes  which  is
used  to store the h_name, h_aliases, and h_addr_list ele-
ments of the struct hostent returned in resbuf.   Success-
ful  calls to lwres_gethostbyname_r() return resbuf, which
is a pointer to the struct hostent it created.


The key line, here, is: "resbuf is a pointer to a struct hostent which is
initialised by a successful call to lwres_gethostbyname_r()
"

So you don't pass memory for the second arg. You just pass the address of
a 'struct hostent'.

A sizeof 36 for arg #3 (your c_buff) seems kinda small. This is what is used
as the memory chunk for the three 'char *'s in the 'struct hostent'. Cranking
it up to 1024 or higher would be more flexible when a host with a bunch of
aliases and several IP's is found.

So maybe something like the following will work out better:
    char *c_buff;
    size_t c_buff_len=0;
    int err=0;
    do {
        c_buff_len += 1024;
        c_buff = realloc(c_buff, c_buff_len);
        he = gethostbyname_r(cpt->strarg, he, c_buff, c_buff_len, &err);
    } while (he == NULL && errno == ERANGE);
    if (he == NULL) {
        /* the error handling code */
    }


< Message edited by Rodney -- Jul. 22, '05, 2:52:16 AM >

(in reply to breiter)
Post #: 10
RE: clamd STREAM segfault - Jul. 22, '05, 8:38:41 AM   
breiter

 

Posts: 346
Joined: Jun. 14, '04,
From: Washington, DC
Status: online
I had something like that in the first place but it segvs out instantly when gethostbyname_r() is called instead of 30 seconds later when the threadpool spins down the idle worker thread.

So I was convinced I had misunderstood the instructions and was desperately trying just about anything.

[New LWP 2]

Program received signal SIGSEGV, Segmentation fault.
[Switching to LWP 2]
0x00475efa in copy_hostent (he=0x119f1c0, hptr=0x0, buf=0x113a3e0 "(6\024\001x\001\022", buflen=1024) at gethostent_r.c:171
in gethostent_r.c

(in reply to Rodney)
Post #: 11
RE: clamd STREAM segfault - Jul. 22, '05, 12:40:05 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
Looking at the code in BIND I find there are two different versions of
gethostbyname_r(). They take different args and are coded differently.
It looks like the "lwres_" version is safer. So try calling this version
instead (lwres_gethostbyname_r()). You'll need to link "-llwres" too.

I'm going to look into why there are two versions so different. Several
months ago I probably knew.

(in reply to breiter)
Post #: 12
RE: clamd STREAM segfault - Jul. 22, '05, 1:30:15 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
It looks like you'll want to start the "lwresd" daemon to use the lwres_*() API's
according to the man page "lwres".

(in reply to Rodney)
Post #: 13
RE: clamd STREAM segfault - Jul. 22, '05, 2:05:00 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
I'm going through the BIND code right now. There's a lot of it.
Anyway, I'm trying to see why the copy_hostent() would fail.
Gethostbyname_r() is calling gethostbyname() but the internally
used memory looks like it should be thread specific, but to be really
sure I have to follow several header files, etc. So it may be a while.

(in reply to Rodney)
Post #: 14
RE: clamd STREAM segfault - Jul. 22, '05, 2:56:42 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
The BIND 9 library has been built for pthreads from the get-go.
I've confirmed that the pthread option is active in the code by
looking at the statics in the archive ('b' and 'd') from nm.
The code reads that for BIND 9 gethostbyname() is thread safe. Even
when the program itself is not using threads, the memory used by
gethostbyname() et al. is on a per thread basis by the way libbind
was built.
But clearly from your testing this is the crucial point.
So now I'm just confused.

(in reply to Rodney)
Post #: 15
RE: clamd STREAM segfault - Jul. 22, '05, 3:16:00 PM   
breiter

 

Posts: 346
Joined: Jun. 14, '04,
From: Washington, DC
Status: online
> So now I'm just confused.

Oh no. I'm afraid to ask this, but is it possible we are exercising a defect in the platform?

I am asking because there was that whole thing with shell scripts segving KSH for no apparent reason.

(in reply to Rodney)
Post #: 16
RE: clamd STREAM segfault - Jul. 22, '05, 3:20:51 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
> So now I'm just confused.

Malloc() is thread safe. But BIND is calling memget() to get more memory.
Memget() gets a chunk of memory back from malloc() and then parcels
it out (skipping the claims of why memget() is used). But memget() and friends
are not thread safe. If two different threads call memget at the same time
when an already allocated memory chunk exists... there is no lock taken
either within the memory functions or at the higher/callers level.

So, my current theory is at least two threads with clamd are making some
BIND API call that goes to get memory from the "chunk". We now hit what
is normally viewed as the olde DB conflict of updating, but with memory.
So I'm going to try a build here with locking and get back to you (I have
to go out for a short bit).

Funny enough FreeBSD was carrying the memget() et al. functions in 4.X for
general use (in libc) but seems to have dropped them in 5.X.

(in reply to Rodney)
Post #: 17
RE: clamd STREAM segfault - Jul. 22, '05, 5:14:06 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
> Oh no. I'm afraid to ask this, but is it possible we are exercising a defect in the platform?

I don't have access to everything (e.g. the pthread code), so I can't say for sure.
My hope is that it is just this use of memget().

(in reply to Rodney)
Post #: 18
RE: clamd STREAM segfault - Jul. 22, '05, 5:29:18 PM   
Rodney

 

Posts: 3714
Joined: Jul. 9, '02,
From: /Tools lab
Status: offline
Okay, I've placed for Brian a special version of the bind package.
It's in the 3.5/beta directory. I've added some minimal locking. Let's see
if this is enough or ...
Update your bind and then relink clamd.
pkg_add ftp://ftp.interopsystems.com/pkgs/beta/bind-9.2.3.6.1-bin.tgz

(in reply to Rodney)
Post #: 19
RE: clamd STREAM segfault - Jul. 23, '05, 9:09:27 AM   
breiter

 

Posts: 346
Joined: Jun. 14, '04,
From: Washington, DC
Status: online
Got your bind build.

[breiter@prometheus]# pkg_info | egrep bind
bind-9.2.3.6.1-bin Version 9.2.3.6.1 of bind for Interix.
env_ldflags_bind-1.0-bin Version 1.0 of env_ldflags_bind for Interix 3.5.

I don't think that got us anywhere.

With gethostbyname_r:
        if((cpt = cfgopt(copt, "TCPAddr"))->enabled) {
            size_t c_buff_len=0;
                char *c_buff;
                int err=0;
                do {
                        c_buff_len += 1024;
                        c_buff = realloc(c_buff, c_buff_len);
                        he = gethostbyname_r(cpt->strarg, he, c_buff, c_buff_len, &err);
                } while (he == NULL && err == TRY_AGAIN);
    if (he == NULL) {
                logg("!gethostbyname_r(%s) error: %s\n", cpt->strarg);
                mdprintf(odesc, "gethostbyname_r(%s) ERROR\n", cpt->strarg);
                return -1;
    }
                server.sin_addr = *(struct in_addr *) he->h_addr_list[0] ;

This still happens:

[New LWP 2]

Program received signal SIGSEGV, Segmentation fault.
[Switching to LWP 2]
0x00475f0a in copy_hostent (he=0x11ff1c0, hptr=0x0, buf=0x119a3e0 "(6\032\001x\001\022", buflen=1024) at gethostent_r.c:171
in gethostent_r.c
(gdb)


When I use gethostbyname:
        if((cpt = cfgopt(copt, "TCPAddr"))->enabled) {
            pthread_mutex_lock(&gh_mutex);
            if((he = gethostbyname(cpt->strarg)) == 0) {
                logg("!gethostbyname(%s) error: %s\n", cpt->strarg);
                mdprintf(odesc, "gethostbyname(%s) ERROR\n", cpt->strarg);
                pthread_mutex_unlock(&gh_mutex);
                return -1;
            }
            server.sin_addr = *(struct in_addr *) he->h_addr_list[0];
                pthread_mutex_unlock(&gh_mutex);

After the worker thread is idle for 30 seconds, this happens as it is spun down:

[New LWP 2]

Program received signal SIGSEGV, Segmentation fault.
[Switching to LWP 2]
0x00496b8e in hesiod_end (context=0x1e8f08) at hesiod.c:174
in hesiod.c
Current language: auto; currently c

(in reply to Rodney)
Post #: 20
Page:   [1] 2   next >   >>
All Forums >> [SFU / Interix / SUA Technology] >> Interix Advanced Forum >> clamd STREAM segfault Page: [1] 2   next >   >>
Jump to:





New Messages No New Messages
Hot Topic w/ New Messages Hot Topic w/o New Messages
Locked w/ New Messages Locked w/o New Messages
 Post New Thread
 Reply to Message
 Post New Poll
 Submit Vote
 Delete My Own Post
 Delete My Own Thread
 Rate Posts


Search All Forums -

Advanced search


SPONSORS



Forum Software © ASPPlayground.NET Advanced Edition 2.5 ANSI

0.125