#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * utf8conv.c - Program to convert UTF-8 encoded files to use
 *              XML entity references to the Unicode characters instead.
 *
 * Reads from STDIN, outputs to STDOUT.
 *
 * To make: make utf8conv   (yes, that simple)
 *
 * TODO: Allow it to read/write (multiple)? files
 *       -h usage message
 *       Validate that continuation bytes really have the form 10xxxxxx
 *
 * DWS - 10/10/01
 */

#define BUFSIZE 2048

/* Input buffer shared by all bget() calls; allocated lazily on first use
 * and kept for the lifetime of the process. */
char *buf;

/*
 * int bget(int in)
 *
 * Reads one byte from the file descriptor `in`, refilling an internal
 * BUFSIZE-byte buffer with read(2) whenever it runs empty so that the
 * caller does not pay one system call per byte.
 *
 * Returns the byte as a value in 0..255, or -1 once read() reports end
 * of input.  Exits the process with status 1 if the buffer cannot be
 * allocated or if read() fails.
 *
 * The buffer position is kept in static variables, so only one
 * descriptor can be read at a time.
 */
int bget(int in)
{
    static ssize_t pos, size;

    if (!buf) {
        if ((buf = malloc(BUFSIZE)) == NULL) {
            fprintf(stderr, "Unable to malloc %d bytes.\n\n", BUFSIZE);
            exit(1);
        }
        pos = size = 0;
    }

    if (pos == size) {
        /* Buffer exhausted: refill.  A return of 0 means end of input. */
        if ((size = read(in, buf, BUFSIZE)) < 0) {
            perror("Unable to read: ");
            exit(1);
        }
        pos = 0;
    }

    if (pos < size) {
        /* Go through unsigned char so bytes >= 0x80 are never negative
         * and therefore never mistaken for the -1 end-of-input marker. */
        return (unsigned char)buf[pos++];
    }
    return -1;
}

/*
 * Reads the next byte of a multi-byte sequence from `in` and returns its
 * low six payload bits.  Exits with status 1 if the input ends in the
 * middle of the sequence, instead of folding the -1 end marker into the
 * decoded character.
 */
static unsigned continuation_bits(int in)
{
    int byte = bget(in);

    if (byte < 0) {
        fprintf(stderr, "Truncated UTF-8 sequence at end of input.\n");
        exit(1);
    }
    return (unsigned)byte & 63u;
}

/*
 * Copies STDIN to STDOUT.  7-bit ASCII bytes pass through unchanged;
 * every two-, three- or four-byte UTF-8 sequence is replaced by a
 * hexadecimal character reference of the form "&#xHH;" / "&#xHHHH;".
 * Bytes that cannot start a sequence (stray continuation bytes
 * 0x80-0xBF and the invalid lead bytes 0xF8-0xFF) are dropped.
 *
 * Returns 0 on success, 1 if writing to STDOUT failed.
 */
int main(void)
{
    int in = STDIN_FILENO;
    int byte;

    while ((byte = bget(in)) >= 0) {
        unsigned word;

        if (byte < 128) {
            /* 7 bit ASCII */
            putchar(byte);
            continue;
        }

        if (byte >= 192 && byte < 224) {
            /* Two byte character: 110xxxxx 10xxxxxx */
            word = ((unsigned)byte & 31u) << 6;
            word |= continuation_bits(in);
        } else if (byte >= 224 && byte < 240) {
            /* Three byte character: 1110xxxx 10xxxxxx 10xxxxxx */
            word = ((unsigned)byte & 15u) << 12;
            word |= continuation_bits(in) << 6;
            word |= continuation_bits(in);
        } else if (byte >= 240 && byte < 248) {
            /* Four byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            word = ((unsigned)byte & 7u) << 18;
            word |= continuation_bits(in) << 12;
            word |= continuation_bits(in) << 6;
            word |= continuation_bits(in);
        } else {
            /* Not a valid lead byte: nothing sensible to emit. */
            continue;
        }

        /* Pad to two hex digits below U+0100, otherwise to at least four. */
        printf("&#x%0*x;", (word < 0x100) ? 2 : 4, word);
    }

    if (fflush(stdout) == EOF || ferror(stdout)) {
        perror("Unable to write");
        return 1;
    }
    return 0;
}