/* vooly.c -- A pure C, fast Vooly parser Copyright (C) 2005 Christian Neukirchen */ #include #include #include #include #include #include "vooly.h" /* Allocate and initialize a new VoolyParser and return a pointer to it. */ static VoolyParser* vooly_new_parser () { VoolyParser *vp = (VoolyParser *) malloc (sizeof (VoolyParser)); if (!vp) return NULL; vp->state = NULL; vp->next_state = VOOLY_TEXT; vp->depths[0] = 2; vp->depth = vp->depths; vp->offsets[0] = 0; vp->offset = vp->offsets; vp->filepos = 0; vp->alloc = VOOLY_DEFAULT_SIZE; vp->size = 0; vp->buffer = (char *) calloc (VOOLY_DEFAULT_SIZE, sizeof (char)); if (!vp->buffer) return NULL; return vp; } /* Return a new VoolyParser that will parse from the FILE* IO and perform whitespace stripping according to enum VoolyStrip STRIP (see vooly.h for the modes available). Returns NULL if there is not enough memory available. */ VoolyParser * vooly_new_io_parser(FILE *io, enum VoolyStrip strip) { VoolyParser *vp = vooly_new_parser (); if (!vp) return NULL; vp->style = VOOLY_IO_PARSER; vp->source.io = io; vp->strip = strip; return vp; } /* Return a new VoolyParser that will parse from the char* STRING sized SIZE and perform whitespace stripping according to enum VoolyStrip STRIP (see vooly.h for the modes available). Returns NULL if there is not enough memory available. */ VoolyParser * vooly_new_str_parser(char *string, size_t length, enum VoolyStrip strip) { VoolyParser *vp = vooly_new_parser (); if (!vp) return NULL; vp->style = VOOLY_STR_PARSER; vp->source.str.string = string; vp->source.str.last = string + length; vp->strip = strip; return vp; } /* Free a VoolyParser VP created by vooly_new_parser. */ void vooly_free_parser (VoolyParser *vp) { free (vp->buffer); free (vp); } /* Clear and reset the vp->text field. */ inline static void vooly_clear_text (VoolyParser *vp) { /* This is not really needed, but we do it for the convenience for people parsing input without null bytes and to ease debugging. */ bzero (vp->buffer, vp->alloc); vp->text = vp->buffer; vp->size = 0; } /* Strip whitespace of vp->text according to vp->strip, see vooly.h. */ static void vooly_strip (VoolyParser *vp) { switch (vp->strip) { case VOOLY_STRIP_PURE: { int i; for (i = 0; i < vp->size; i++) if (!isspace (vp->text[i])) return; /* Don't strip anything. */ vp->size = 0; /* Strip everything. */ break; } case VOOLY_STRIP_ALL: while (isspace (*vp->text)) vp->text++, vp->size--; while (isspace (vp->text[vp->size-1]) && vp->size > 0) vp->text[vp->size-1] = 0, vp->size--; case VOOLY_STRIP_NEVER: ; } } /* Mark VoolyParser VP as erroneous and sprintf the MESSAGE to the text field. */ static void vooly_error(VoolyParser *vp, char *message, ...) { va_list ap; vp->state = VOOLY_ERROR; vooly_clear_text (vp); va_start (ap, message); vp->size = vsnprintf (vp->text, vp->alloc, message, ap); va_end (ap); } /* Append the char C to the VoolyParser VP, checking for the buffer size and possibly resizing it. Returns 0 if there is no more memory available. */ static inline int vooly_append_char (VoolyParser *vp, char c) { /* Resize if needed. */ if (vp->size >= vp->alloc) { vp->alloc *= 2; vp->text = vp->buffer = realloc (vp->buffer, vp->alloc); if (!vp->buffer) return 0; } vp->text[vp->size++] = c; return 1; } /* Read a char from source and return it. Returns EOF at end of file. */ static inline int vooly_getc (VoolyParser *vp) { vp->filepos++; switch (vp->style) { case VOOLY_IO_PARSER: return getc (vp->source.io); case VOOLY_STR_PARSER: if (vp->source.str.last > vp->source.str.string) return *vp->source.str.string++; } return EOF; } /* Put char C back into the reading buffer of the source. */ static inline void vooly_ungetc (VoolyParser *vp, char c) { vp->filepos--; switch (vp->style) { case VOOLY_IO_PARSER: ungetc (c, vp->source.io); break; case VOOLY_STR_PARSER: vp->source.str.string--; } } /* These macros hopefully clear up the issues with pull parsers... */ #define AGAIN return 1 #define DONE return 0 /* Parse one token using the VoolyParser VP. Will set text, size and state of the parser VP. Returns 1 if it wants to be called again, and 0 if there is nothing more to parse, either because of errors or EOF. */ int vooly_next(VoolyParser *vp) { /* The current char. */ int c; /* Number of continuous opening and closing braces. */ int open = 0, close = 0; tailcall: /* Thank you, Scheme. */ if (vp->next_state) { vp->state = vp->next_state; /* If the state changes, the text becomes invalid. */ vooly_clear_text (vp); } while ((c = vooly_getc (vp)) != EOF) { switch (vp->state) { case VOOLY_TEXT: if (c == '<') open++, close = 0; else if (c == '>') close++, open = 0; if (c != '<' && open >= *vp->depth) { vooly_ungetc (vp, c); if (vp->depth - vp->depths >= VOOLY_MAXDEPTH - 1) { vooly_error (vp, "Excessive depth in document " \ "(more than %d at offset %d)", VOOLY_MAXDEPTH, vp->offset[-1]); DONE; } /* Save previous depth and offset. */ vp->depth++; *vp->depth = open; vp->offset++; *vp->offset = vp->filepos - open; vp->next_state = VOOLY_OPEN; /* Truncate the read braces. */ vp->size -= open; vp->text[vp->size] = 0; vooly_strip (vp); if (vp->size <= 0) /* No text? Straight go on. */ goto tailcall; else AGAIN; } else if (close == *vp->depth) { if (vp->depths >= vp->depth) { vooly_error (vp, "Too many closing tags at offset %d", vp->filepos - close); DONE; } /* Restore previous depth and offset. */ vp->depth--; vp->offset--; vp->next_state = VOOLY_CLOSE; vp->size -= close - 1; /* Drop optional final whitespace. */ if (vp->strip == VOOLY_STRIP_PURE && isspace(vp->text[vp->size - 1])) vp->size--; /* Truncate the read braces. */ vp->text[vp->size] = 0; vooly_strip (vp); if (vp->size <= 0) /* No text? Straight go on. */ goto tailcall; else AGAIN; } else { if (!vooly_append_char (vp, c)) { vooly_error (vp, "Out of memory"); DONE; } if (c != '<' && c != '>') open = close = 0; } break; case VOOLY_OPEN: /* Read the tag. */ if (isalnum (c)) { if (!vooly_append_char (vp, c)) { vooly_error (vp, "Out of memory"); DONE; } } else { if (!(vp->strip == VOOLY_STRIP_PURE && isspace(c))) vooly_ungetc (vp, c); if (vp->size <= 0) vp->text = NULL; vp->next_state = VOOLY_TEXT; AGAIN; } break; case VOOLY_CLOSE: vooly_ungetc (vp, c); vp->text = NULL; vp->next_state = VOOLY_TEXT; AGAIN; break; case VOOLY_ERROR: case VOOLY_EOF: /* Just to be safe. */ vp->text = NULL; DONE; } } /* At EOF. */ vp->next_state = VOOLY_EOF; /* Is there still text to flush? */ if (vp->state == VOOLY_TEXT) { vooly_strip (vp); if (vp->size > 0) AGAIN; } /* Is there still a tag to close? */ if (vp->state == VOOLY_CLOSE) { vp->text = NULL; AGAIN; } if (vp->depths != vp->depth) { vooly_error (vp, "Permature end of data, last open tag started at offset %d", *vp->offset); DONE; } vp->state = VOOLY_EOF; vp->text = NULL; DONE; } #undef AGAIN #undef DONE #ifdef TEST int main(int argc, char **argv) { enum VoolyStrip strip = VOOLY_STRIP_PURE; if (argc > 1) switch(*argv[1]) { case 'P': strip = VOOLY_STRIP_PURE; break; case 'A': strip = VOOLY_STRIP_ALL; break; case 'N': strip = VOOLY_STRIP_NEVER; break; } VoolyParser *vp = vooly_new_io_parser (stdin, strip); while (vooly_next (vp)) printf("%c: '%s'\n", vooly_state(vp), vooly_text(vp)); if (vooly_state(vp) == VOOLY_ERROR) printf ("Error: (%d): %s.\n", vooly_state(vp), vooly_text(vp)); vooly_next (vp); printf("%c: '%s'\n", vooly_state(vp), vooly_text(vp)); vooly_free_parser (vp); return 0; } #endif