	Retrieving documents with HTTP

	This agent conforms to W3A/A. It uses some utility routines
	that are not shown.

	The agents handles http: and proxy: URLs. The latter are like
	HTTP request, except that the URL strat with proxy and the
	path doesn't start with '/' (it starts with `scheme:').

	TO DO: handle content encodings.

	TO DO: fix handling of old HTTP servers; currently their first
	line is lost.

<<*>>=
static char copyright[] = "Copyright NBBI, Den Haag, 1995";
/* Author: Bert Bos <bert@let.rug.nl> */

#define USE_POLL 0

#include <config.h>
#if USE_POLL
#include <poll.h>
#endif
#include <fcntl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <pwd.h>				/* To find who we are */
#include <w3a.h>
#include <tcp.h>				/* connectTCP() */
#include <str.h>				/* String and heap functions */
#include <url.h>				/* URL parsing */
#include <mime.h>				/* Read/parse MIME headers */

#ifndef howmany
#define howmany(x,y)  (((x)+((y)-1))/(y))
#endif

#define NRBITS (sizeof(int) * 8)		/* 8 bits per byte */

static struct {
    Bool status_has_been_read;			/* FALSE until header read */
    Bool header_has_been_read;			/* FALSE until header read */
    Bool non_blocking_io;			/* O_NONBLOCK has been set? */
    W3ADocumentInfo info;
    FILE *f;
    char *headerbuf;				/* Temporarily store MIME */
    int headerlen;				/* Length of headerbuf */
#ifndef OLD
    char *query;				/* Query to send to server */
    int querylen, queryoffset;			/* Length/offset in query */
#endif
} *conn_info[FD_SETSIZE];
@

	[[send_HTTP_request]] sends a request to an HTTP server.
	Arguments are: [[f]] = the socket; [[selector]] = the thing
	that is requested; [[method]] = HTTP method to use;
	[[referer]] = URL of document with source of hyperlink.

	An auxiliary function [[send_HTRQ_headers]] sends the MIME
	headers for an HTTP request. The [[From:]] header should give
	the E-mail address of the user. The [[Accept:]] header gives a
	list of accepted file formats. There may be more than one
	Accept: header. [[User-Agent:]] and [[Referer:]] are also sent.

<<*>>=
#define HTTPVERSION "HTTP/1.0"			/* HTTP protocol version */
#define MAXHOSTNAMELEN 256			/* Name of local machine */

static Strip proxy;				/* = "proxy" */


#ifdef OLD
static void send_HTRQ_headers(FILE *f, const char *referer)
{
    W3ABrowserInfo info;			/* Accepted formats */
    int i;

#if 0
    {
	struct passwd *pwent;			/* Info about user */
	struct hostent *phe;			/* Info about localhost */
	char host[MAXHOSTNAMELEN];		/* Name of local machine */

	if ((pwent = getpwuid(getuid()))
	    && (gethostname(host, sizeof(host)) == 0)
	    && (phe = gethostbyname(host)))
	    fprintf(f, "From: %s@%s\r\n", pwent->pw_name, phe->h_name);
    }
#endif /* 0 */

    W3AbrowserInfo(&info);
    for (i = 0; i < info.nformats; i++)
	if (info.preferences[i] == 1.0)
	    fprintf(f, "Accept: %s\015\012", info.formats[i]);
	else
	    fprintf(f, "Accept: %s; q=%f\015\012", info.formats[i],
		    info.preferences[i]);
    fprintf(f, "User-Agent: %s\015\012", info.version);
    if (referer) fprintf(f, "Referer: %s\015\012", referer);
}


static Bool send_HTTP_request(FILE *f, URI uri, int method,
			      const char *referer)
{
    char *path, *search, *meth;

    path = strip2str(uri.path);
    search = uri.search ? strip2str(uri.search) : NULL;

    switch (method) {
    case GET_METHOD: meth = "GET"; break;
    case PUT_METHOD: meth = "PUT"; break;
    case POST_METHOD: meth = "POST"; break;
    case HEAD_METHOD: meth = "HEAD"; break;
    default: errno = EMETHOD; return FALSE;	/* Illegal method */
    }
    fprintf(f, "%s %s%s%s %s\r\n", meth, strip2str(uri.path),
	    search ? "?" : "", search ? search : "", HTTPVERSION);
    send_HTRQ_headers(f, referer);

    if (method != PUT_METHOD && method != POST_METHOD)
	fprintf(f, "\r\n");			/* End of headers */
    fflush(f);					/* Make ready for read */
    return TRUE;
}
#endif /* OLD */
@

	[[send_request]] tries to send the query to server. In the
	case of non-blocking I/O this may fail or succeed only
	partially. Possible return codes are: [[TRUE]] if all of the
	query has been sent and the agent doesn't need to write
	anything more; [[FALSE]] with [[errno = EAGAIN]] if the agent
	still needs to write; [[FALSE]] with any other [[errno]]
	in case of an error.

<<*>>=
#ifndef OLD
static Bool ready_for_write(int fd)
{
#if USE_POLL
    struct pollfd fds[1];

    fds[0].fd = fd;
    fds[0].events = POLLOUT;
    /* return: -1 = err; 0 = timed out; 1 = input available */
    return poll(fds, 1, 0) > 0;
#else /* USE_POLL */
    int mask[howmany(FD_SETSIZE, NRBITS)];
    struct timeval timeout;
    int n, i;

    timeout.tv_sec = 0;
    timeout.tv_usec = 0;
    for (i = 0; i < XtNumber(mask); i++) mask[i] = 0;
    mask[fd/NRBITS] |= 1 << (fd % NRBITS);
    return select(fd + 1, NULL, mask, NULL, &timeout) > 0;
#endif /* USE_POLL */
}


static Bool send_request(int fd)
{
    int n;

    if (conn_info[fd]->queryoffset != conn_info[fd]->querylen
	&& (!conn_info[fd]->non_blocking_io || ready_for_write(fd))) {
	n = write(fd, conn_info[fd]->query + conn_info[fd]->queryoffset,
		  conn_info[fd]->querylen - conn_info[fd]->queryoffset);
	if (n == -1) return FALSE;
	conn_info[fd]->queryoffset += n;
    }
    if (conn_info[fd]->queryoffset != conn_info[fd]->querylen) {
	errno = EAGAIN;
	return FALSE;
    } else {
	return TRUE;
    }
}
#endif /* OLD */
@

	[[create_HTTP_request]] creates the query and MIME headers
	that are to be sent to the remote server. The request is
	stored in [[conn_info[fd]]->query]].

<<*>>=
#ifndef OLD
static Bool create_HTTP_request(int fd, Bool is_proxy, URI uri, int method,
				const char *referer)
{
    char *path, *search, *meth, *s;
    W3ABrowserInfo info;			/* Accepted formats */
    int i, n, len;

    path = strip2str(uri.path);
    if (is_proxy) {
	assert(path[0] == '/');
	path++;
    }
    search = uri.search ? strip2str(uri.search) : NULL;

    switch (method) {
    case GET_METHOD: meth = "GET"; break;
    case PUT_METHOD: meth = "PUT"; break;
    case POST_METHOD: meth = "POST"; break;
    case HEAD_METHOD: meth = "HEAD"; break;
    default: errno = EMETHOD; return FALSE;	/* Illegal method */
    }
    W3AbrowserInfo(&info);
    n = strlen(meth) + strlen(path) + (search ? strlen(search) : 0)
	+ strlen(HTTPVERSION) + strlen(info.version) + strlen(referer) + 50;
    for (i = 0; i < info.nformats; i++)
	n += strlen(info.formats[i]) + (info.preferences[i] == 1.0 ? 10 : 35);

    newarray(s, n);
    len = sprintf(s, "%s %s%s%s %s\r\n", meth, path,
		  search ? "?" : "", search ? search : "", HTTPVERSION);

    for (i = 0; i < info.nformats; i++) {
	if (info.preferences[i] == 1.0)
	    len += sprintf(s + len, "Accept: %s\015\012", info.formats[i]);
	else
	    len += sprintf(s + len, "Accept: %s; q=%f\015\012",
			   info.formats[i], info.preferences[i]);
    }
    len += sprintf(s + len, "User-Agent: %s\015\012", info.version);
    if (referer) len += sprintf(s + len, "Referer: %s\015\012", referer);

    if (method != PUT_METHOD && method != POST_METHOD)
	len += sprintf(s + len, "\r\n");	/* End of headers */

    assert(len < n);				/* Allocated enough space? */
    conn_info[fd]->query = s;
    conn_info[fd]->queryoffset = 0;
    conn_info[fd]->querylen = len;
    return TRUE;
}
#endif /* OLD */
@

	The exported functions are: [[initHTTP]], [[openHTTP]],
	[[readHTTP]], [[writeHTTP]], [[infoHTTP]], [[closeHTTP]],
	[[peekHTTP]], [[doneHHTP]] and [[deleteHTTP]]. Deleting a
	document is not implemented yet.

<<*>>=
EXPORT Bool initHTTP(char ***protocols, int *nrprotocols)
{
    static char *protos[] = {"http", "proxy"};

    *protocols = protos;
    *nrprotocols = XtNumber(protos);
    proxy = str2strip("proxy");
    return TRUE;
}


EXPORT int openHTTP(const char *url, int method, int flags,
		    const char *referer)
{
    URI uri;
    char *host, *port;
    int s;
    FILE *f;
    Bool is_proxy;

    if (! URL_parse(url, &uri)) {
	errno = EURL;				/* Bad URL syntax */
	return -1;
    }

    is_proxy = uri.scheme == proxy;
    port = uri.port ? strip2str(uri.port) : "80";
    host = strip2str(uri.host);
    /* if ((s = connectTCP(host, port, flags & O_NONBLOCK)) == -1 */
    if ((s = connectTCP(host, port, FALSE)) == -1
	&& errno != EINPROGRESS)
	return -1;				/* Could not connect */
    if (!(f = fdopen(s, "r+")))
	return -1;				/* I/O error */
#ifdef OLD
    if (!send_HTTP_request(f, uri, method, referer))
	return -1;				/* Illegal method */
#endif
#if 1
    if ((flags & O_NONBLOCK) != 0 && fcntl(s, F_SETFL, O_NONBLOCK) == -1)
	return -1;				/* I/O error */
#endif

    new(conn_info[s]);
    conn_info[s]->info.url = newstring(url);
    conn_info[s]->info.mime_type = NULL;
    conn_info[s]->info.mime_params = NULL;
    conn_info[s]->info.title = NULL;
    conn_info[s]->info.referer = newstring(referer);
    conn_info[s]->info.status = NULL;
    conn_info[s]->info.location = NULL;
    conn_info[s]->header_has_been_read = FALSE;
    conn_info[s]->status_has_been_read = FALSE;
    conn_info[s]->non_blocking_io = (flags & O_NONBLOCK) != 0;
    conn_info[s]->f = f;
    conn_info[s]->headerbuf = NULL;
    conn_info[s]->headerlen = 0;
#ifndef OLD
    if (! create_HTTP_request(s, is_proxy, uri, method, referer)) return -1;
    if ((flags & O_NONBLOCK) == 0 && ! send_request(s)) return -1;
#endif
    return s;
}



EXPORT Bool doneHTTP(int fd)
{
    assert(0 <= fd && fd < FD_SETSIZE && conn_info[fd]);
    return send_request(fd);
}



EXPORT int peekHTTP(int fd)
{
#if USE_POLL
    struct pollfd fds[1];

    fds[0].fd = fd;
    fds[0].events = POLLNORM;
    /* return: -1 = err; 0 = timed out; 1 = input available */
    return poll(fds, 1, 0);
#else /* USE_POLL */
    int mask[howmany(FD_SETSIZE, NRBITS)];
    struct timeval timeout;
    int n, i;

    timeout.tv_sec = 0;
    timeout.tv_usec = 0;
    for (i = 0; i < XtNumber(mask); i++) mask[i] = 0;
    mask[fd/NRBITS] |= 1 << (fd % NRBITS);
    if ((n = select(fd + 1, mask, NULL, NULL, &timeout)) == -1) return -1;
    return n;					/* 0 or 1 */
#endif /* USE_POLL */
}


/* read_status -- read the server's response code */
static Bool read_status(int s)
{
    char buf[BUFSIZ];
    FILE *f = conn_info[s]->f;
    Bool ok;
    int i;

    ok = fgets(buf, sizeof(buf), f) != NULL;	/* Read status line */
    if (! ok)					/* No data available */
	return errno == EAGAIN;			/* I/O error?*/

    if (! n_eq(buf, "HTTP", 4)) {		/* Old server: HTTP 0.9 */
	conn_info[s]->info.mime_type = newstring("text/html");
	/* Sorry, we loose the first line... */
    } else {					/* HTTP/1.0 or newer */
	for (i = 4; !isspace(buf[i]); i++) ;
	for (; buf[i] && isspace(buf[i]); i++) ;
	conn_info[s]->info.status = newstring(buf + i);
    }
    conn_info[s]->status_has_been_read = TRUE;
    return TRUE;
}


/* interpose_decoder -- start gzip -d -c as a subprocess */
static Bool interpose_decoder(int s)
{
    int fd[2];					/* Pipe */
    long c, tblsiz;

    fflush(conn_info[s]->f);
    if (pipe(fd) == -1) return FALSE;		/* Create pipe */
    switch (fork()) {				/* Fork process */
    case -1: return FALSE;			/* Out of processes */
    case 0:
	/* A subprocess that reads data from the original socket */
	if (conn_info[s]->non_blocking_io)	/* Remove non-blocking IO */
	    if (fcntl(s, F_SETFL, 0) == -1) exit(1);
	if (close(0) == -1 || dup(s) != 0) exit(1);
	if (close(1) == -1 || dup(fd[1]) != 1) exit(1);
	/* if (close(2) == -1 || dup(fd[1]) != 2) exit(1); */
	/* Close all unneeded files */
	tblsiz = sysconf(_SC_OPEN_MAX);
	for (c = 3; c < tblsiz; c++) (void) close(c);
	debug("Started gzip -d -c\n");
	execlp("gzip", "gzip", "-d", "-c", NULL);
	/* NOTREACHED */
    }
    if (close(fd[1]) == -1) return FALSE;	/* I/O Error */
    /* Rename the pipe to s, so that the subprocess becomes `invisible' */
    if (close(s) == -1 || dup2(fd[0], s) == -1) return FALSE;
    if (conn_info[s]->non_blocking_io)		/* Restore non-blocking IO */
	if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) return FALSE;
    return TRUE;
}

/* read_MIME_header -- read the MIME header lines and parse them */
static Bool read_MIME_header(int s)
{
    MIME_header header;
    FILE *f = conn_info[s]->f;
    char line[BUFSIZ];
    int h;

#if 0
    /* TO DO: Something needs to be done to make this non-blocking... */
    read_header(f, &header, NULL);
#else
    do {
	if (! fgets(line, sizeof(line), f)) return FALSE; /* Might be EAGAIN */
	h = conn_info[s]->headerlen + strlen(line);
	renewarray(conn_info[s]->headerbuf, h + 1);
	strcpy(conn_info[s]->headerbuf + conn_info[s]->headerlen, line);
	conn_info[s]->headerlen = h;
    } while (line[0] != '\015' && line[0] != '\012'); /* End of header */
    parse_header(conn_info[s]->headerbuf, &header);
    dispose(conn_info[s]->headerbuf);
#endif
    if (header.head[Location])
	conn_info[s]->info.location = newstring(header.head[Location]);
    if (header.head[Title])
	conn_info[s]->info.title = newstring(header.head[Title]);
    if (header.head[Base])
	conn_info[s]->info.url = newstring(header.head[Base]);
    if (header.head[Content_Length])
	conn_info[s]->info.size = atol(header.head[Content_Length]);
    if (header.head[Content_Type])
	conn_info[s]->info.mime_type = newstring(header.head[Content_Type]);
    else
	conn_info[s]->info.mime_type = newstring("text/html");
    conn_info[s]->header_has_been_read = TRUE;
    /* Something with mime_params, charset?... */

    /* Check for encoding */
    switch (header.content_transfer_encoding) {
    case MIME_8bit:
    case MIME_7bit:
	break;					/* No encoding */
    case MIME_quoted_printable:
    case MIME_base64:
	errno = EFORMAT;			/* Not yet implemented */
	return FALSE;
    case MIME_x_gzip:
	return interpose_decoder(s);		/* Call gzip -d -c */
    default:
	errno = EFORMAT;
	return FALSE;
    }
    return TRUE;
}


EXPORT int readHTTP(int fd, char *buf, size_t nbytes)
{
    assert(conn_info[fd]);
    if (conn_info[fd]->non_blocking_io) {
	if (!conn_info[fd]->status_has_been_read) {
	    if (read_status(fd)) errno = EAGAIN;
	    return -1;
	}
	if (!conn_info[fd]->header_has_been_read) {
	    if (read_MIME_header(fd)) errno = EAGAIN;
	    return -1;
	}
    } else {
	if (!conn_info[fd]->status_has_been_read)
	    if (!read_status(fd)) return -1;
	if (!conn_info[fd]->header_has_been_read)
	    if (!read_MIME_header(fd)) return -1;
    }
    return fread(buf, 1, nbytes, conn_info[fd]->f);
}


EXPORT int writeHTTP(int fd, const char *buf, size_t nbytes)
{
    assert(conn_info[fd]);
    return write(fd, buf, nbytes);
}


EXPORT Bool infoHTTP(int fd, W3ADocumentInfo *buf)
{
    assert(conn_info[fd]);
    if (!conn_info[fd]->status_has_been_read && !read_status(fd))
	return FALSE;
    if (!conn_info[fd]->header_has_been_read && !read_MIME_header(fd))
	return FALSE;
    buf->mime_type = newstring(conn_info[fd]->info.mime_type);
    buf->mime_params = newstring(conn_info[fd]->info.mime_params);
    buf->title = newstring(conn_info[fd]->info.title);
    buf->status = newstring(conn_info[fd]->info.status);
    buf->location = newstring(conn_info[fd]->info.location);
    buf->size = conn_info[fd]->info.size;
    return TRUE;
}


EXPORT Bool closeHTTP(int fd)
{
    int status;

    assert(conn_info[fd]);
    status = fclose(conn_info[fd]->f);
    dispose(conn_info[fd]->headerbuf);
#ifndef OLD
    dispose(conn_info[fd]->query);
#endif
    dispose(conn_info[fd]);
    return status != -1;
}


EXPORT Bool deleteHTTP(const char *url)
{
    errno = ENYI;				/* Not yet implemented */
    return FALSE;
}
