642 lines
14 KiB
C
642 lines
14 KiB
C
/* iksemel (XML parser for Jabber)
|
|
** Copyright (C) 2000-2004 Gurer Ozen <madcat@e-kolay.net>
|
|
** This code is free software; you can redistribute it and/or
|
|
** modify it under the terms of GNU Lesser General Public License.
|
|
*/
|
|
|
|
#include "common.h"
|
|
#include "iksemel.h"
|
|
|
|
enum cons_e {
|
|
C_CDATA = 0,
|
|
C_TAG_START,
|
|
C_TAG,
|
|
C_TAG_END,
|
|
C_ATTRIBUTE,
|
|
C_ATTRIBUTE_1,
|
|
C_ATTRIBUTE_2,
|
|
C_VALUE,
|
|
C_VALUE_APOS,
|
|
C_VALUE_QUOT,
|
|
C_WHITESPACE,
|
|
C_ENTITY,
|
|
C_COMMENT,
|
|
C_COMMENT_1,
|
|
C_COMMENT_2,
|
|
C_COMMENT_3,
|
|
C_MARKUP,
|
|
C_MARKUP_1,
|
|
C_SECT,
|
|
C_SECT_CDATA,
|
|
C_SECT_CDATA_1,
|
|
C_SECT_CDATA_2,
|
|
C_SECT_CDATA_3,
|
|
C_SECT_CDATA_4,
|
|
C_SECT_CDATA_C,
|
|
C_SECT_CDATA_E,
|
|
C_SECT_CDATA_E2,
|
|
C_PI
|
|
};
|
|
|
|
/* if you add a variable here, dont forget changing iks_parser_reset */
|
|
struct iksparser_struct {
|
|
ikstack *s;
|
|
void *user_data;
|
|
iksTagHook *tagHook;
|
|
iksCDataHook *cdataHook;
|
|
iksDeleteHook *deleteHook;
|
|
/* parser context */
|
|
char *stack;
|
|
size_t stack_pos;
|
|
size_t stack_max;
|
|
|
|
enum cons_e context;
|
|
enum cons_e oldcontext;
|
|
|
|
char *tag_name;
|
|
enum ikstagtype tagtype;
|
|
|
|
unsigned int attmax;
|
|
unsigned int attcur;
|
|
int attflag;
|
|
char **atts;
|
|
int valflag;
|
|
|
|
unsigned int entpos;
|
|
char entity[8];
|
|
|
|
unsigned long nr_bytes;
|
|
unsigned long nr_lines;
|
|
|
|
int uni_max;
|
|
int uni_len;
|
|
};
|
|
|
|
iksparser *
|
|
iks_sax_new (void *user_data, iksTagHook *tagHook, iksCDataHook *cdataHook)
|
|
{
|
|
iksparser *prs;
|
|
|
|
prs = iks_malloc (sizeof (iksparser));
|
|
if (NULL == prs) return NULL;
|
|
memset (prs, 0, sizeof (iksparser));
|
|
prs->user_data = user_data;
|
|
prs->tagHook = tagHook;
|
|
prs->cdataHook = cdataHook;
|
|
return prs;
|
|
}
|
|
|
|
iksparser *
|
|
iks_sax_extend (ikstack *s, void *user_data, iksTagHook *tagHook, iksCDataHook *cdataHook, iksDeleteHook *deleteHook)
|
|
{
|
|
iksparser *prs;
|
|
|
|
prs = iks_stack_alloc (s, sizeof (iksparser));
|
|
if (NULL == prs) return NULL;
|
|
memset (prs, 0, sizeof (iksparser));
|
|
prs->s = s;
|
|
prs->user_data = user_data;
|
|
prs->tagHook = tagHook;
|
|
prs->cdataHook = cdataHook;
|
|
prs->deleteHook = deleteHook;
|
|
return prs;
|
|
}
|
|
|
|
ikstack *
|
|
iks_parser_stack (iksparser *prs)
|
|
{
|
|
return prs->s;
|
|
}
|
|
|
|
void *
|
|
iks_user_data (iksparser *prs)
|
|
{
|
|
return prs->user_data;
|
|
}
|
|
|
|
unsigned long
|
|
iks_nr_bytes (iksparser *prs)
|
|
{
|
|
return prs->nr_bytes;
|
|
}
|
|
|
|
unsigned long
|
|
iks_nr_lines (iksparser *prs)
|
|
{
|
|
return prs->nr_lines;
|
|
}
|
|
|
|
#define IS_WHITESPACE(x) ' ' == (x) || '\t' == (x) || '\r' == (x) || '\n' == (x)
|
|
#define NOT_WHITESPACE(x) ' ' != (x) && '\t' != (x) && '\r' != (x) && '\n' != (x)
|
|
|
|
static int
|
|
stack_init (iksparser *prs)
|
|
{
|
|
prs->stack = iks_malloc (128);
|
|
if (!prs->stack) return 0;
|
|
prs->stack_max = 128;
|
|
prs->stack_pos = 0;
|
|
return 1;
|
|
}
|
|
|
|
static int
|
|
stack_expand (iksparser *prs, int len)
|
|
{
|
|
size_t need;
|
|
off_t diff;
|
|
char *tmp;
|
|
need = len - (prs->stack_max - prs->stack_pos);
|
|
if (need < prs->stack_max) {
|
|
need = prs->stack_max * 2;
|
|
} else {
|
|
/* need x 1.2 for integer only archs like ARM */
|
|
need = prs->stack_max + ( (need * 6) / 5);
|
|
}
|
|
tmp = iks_malloc (need);
|
|
if (!tmp) return 0;
|
|
diff = tmp - prs->stack;
|
|
memcpy (tmp, prs->stack, prs->stack_max);
|
|
iks_free (prs->stack);
|
|
prs->stack = tmp;
|
|
prs->stack_max = need;
|
|
prs->tag_name += diff;
|
|
if (prs->attflag != 0) {
|
|
int i = 0;
|
|
while (i < (prs->attmax * 2)) {
|
|
if (prs->atts[i]) prs->atts[i] += diff;
|
|
i++;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
#define STACK_INIT \
|
|
if (NULL == prs->stack && 0 == stack_init (prs)) return IKS_NOMEM
|
|
|
|
#define STACK_PUSH_START (prs->stack + prs->stack_pos)
|
|
|
|
#define STACK_PUSH(buf,len) \
|
|
{ \
|
|
char *sbuf = (buf); \
|
|
size_t slen = (len); \
|
|
if (prs->stack_max - prs->stack_pos <= slen) { \
|
|
if (0 == stack_expand (prs, slen)) return IKS_NOMEM; \
|
|
} \
|
|
memcpy (prs->stack + prs->stack_pos, sbuf, slen); \
|
|
prs->stack_pos += slen; \
|
|
}
|
|
|
|
#define STACK_PUSH_END \
|
|
{ \
|
|
if (prs->stack_pos >= prs->stack_max) { \
|
|
if (0 == stack_expand (prs, 1)) return IKS_NOMEM; \
|
|
} \
|
|
prs->stack[prs->stack_pos] = '\0'; \
|
|
prs->stack_pos++; \
|
|
}
|
|
|
|
static enum ikserror
|
|
sax_core (iksparser *prs, char *buf, int len)
|
|
{
|
|
enum ikserror err;
|
|
int pos = 0, old = 0, re, stack_old = -1;
|
|
unsigned char c;
|
|
|
|
while (pos < len) {
|
|
re = 0;
|
|
c = buf[pos];
|
|
if (0 == c || 0xFE == c || 0xFF == c) return IKS_BADXML;
|
|
if (prs->uni_max) {
|
|
if ((c & 0xC0) != 0x80) return IKS_BADXML;
|
|
prs->uni_len++;
|
|
if (prs->uni_len == prs->uni_max) prs->uni_max = 0;
|
|
goto cont;
|
|
} else {
|
|
if (c & 0x80) {
|
|
unsigned char mask;
|
|
if ((c & 0x60) == 0x40) {
|
|
prs->uni_max = 2;
|
|
mask = 0x1F;
|
|
} else if ((c & 0x70) == 0x60) {
|
|
prs->uni_max = 3;
|
|
mask = 0x0F;
|
|
} else if ((c & 0x78) == 0x70) {
|
|
prs->uni_max = 4;
|
|
mask = 0x07;
|
|
} else if ((c & 0x7C) == 0x78) {
|
|
prs->uni_max = 5;
|
|
mask = 0x03;
|
|
} else if ((c & 0x7E) == 0x7C) {
|
|
prs->uni_max = 6;
|
|
mask = 0x01;
|
|
} else {
|
|
return IKS_BADXML;
|
|
}
|
|
/* if ((c & mask) == 0) return IKS_BADXML; I AM WRONG */
|
|
prs->uni_len = 1;
|
|
if (stack_old == -1
|
|
&& (prs->context == C_TAG
|
|
|| prs->context == C_ATTRIBUTE_1
|
|
|| prs->context == C_VALUE_APOS
|
|
|| prs->context == C_VALUE_QUOT)) stack_old = pos;
|
|
goto cont;
|
|
}
|
|
}
|
|
|
|
switch (prs->context) {
|
|
case C_CDATA:
|
|
if ('&' == c) {
|
|
if (old < pos && prs->cdataHook) {
|
|
err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
|
|
if (IKS_OK != err) return err;
|
|
}
|
|
prs->context = C_ENTITY;
|
|
prs->entpos = 0;
|
|
break;
|
|
}
|
|
if ('<' == c) {
|
|
if (old < pos && prs->cdataHook) {
|
|
err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
|
|
if (IKS_OK != err) return err;
|
|
}
|
|
STACK_INIT;
|
|
prs->tag_name = STACK_PUSH_START;
|
|
if (!prs->tag_name) return IKS_NOMEM;
|
|
prs->context = C_TAG_START;
|
|
}
|
|
break;
|
|
|
|
case C_TAG_START:
|
|
prs->context = C_TAG;
|
|
if ('/' == c) {
|
|
prs->tagtype = IKS_CLOSE;
|
|
break;
|
|
}
|
|
if ('?' == c) {
|
|
prs->context = C_PI;
|
|
break;
|
|
}
|
|
if ('!' == c) {
|
|
prs->context = C_MARKUP;
|
|
break;
|
|
}
|
|
prs->tagtype = IKS_OPEN;
|
|
stack_old = pos;
|
|
break;
|
|
|
|
case C_TAG:
|
|
if (IS_WHITESPACE(c)) {
|
|
if (IKS_CLOSE == prs->tagtype)
|
|
prs->oldcontext = C_TAG_END;
|
|
else
|
|
prs->oldcontext = C_ATTRIBUTE;
|
|
prs->context = C_WHITESPACE;
|
|
if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
|
|
stack_old = -1;
|
|
STACK_PUSH_END;
|
|
break;
|
|
}
|
|
if ('/' == c) {
|
|
if (IKS_CLOSE == prs->tagtype) return IKS_BADXML;
|
|
prs->tagtype = IKS_SINGLE;
|
|
prs->context = C_TAG_END;
|
|
if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
|
|
stack_old = -1;
|
|
STACK_PUSH_END;
|
|
break;
|
|
}
|
|
if ('>' == c) {
|
|
prs->context = C_TAG_END;
|
|
if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
|
|
stack_old = -1;
|
|
STACK_PUSH_END;
|
|
re = 1;
|
|
break;
|
|
}
|
|
if (stack_old == -1) stack_old = pos;
|
|
break;
|
|
|
|
case C_TAG_END:
|
|
if (c != '>') return IKS_BADXML;
|
|
if (prs->tagHook) {
|
|
char **tmp;
|
|
if (prs->attcur == 0) tmp = NULL; else tmp = prs->atts;
|
|
err = prs->tagHook (prs->user_data, prs->tag_name, tmp, prs->tagtype);
|
|
if (IKS_OK != err) return err;
|
|
}
|
|
prs->stack_pos = 0;
|
|
stack_old = -1;
|
|
prs->attcur = 0;
|
|
prs->attflag = 0;
|
|
prs->context = C_CDATA;
|
|
old = pos + 1;
|
|
break;
|
|
|
|
case C_ATTRIBUTE:
|
|
if ('/' == c) {
|
|
prs->tagtype = IKS_SINGLE;
|
|
prs->context = C_TAG_END;
|
|
break;
|
|
}
|
|
if ('>' == c) {
|
|
prs->context = C_TAG_END;
|
|
re = 1;
|
|
break;
|
|
}
|
|
if (!prs->atts) {
|
|
prs->attmax = 12;
|
|
prs->atts = iks_malloc (sizeof(char *) * 2 * 12);
|
|
if (!prs->atts) return IKS_NOMEM;
|
|
memset (prs->atts, 0, sizeof(char *) * 2 * 12);
|
|
prs->attcur = 0;
|
|
} else {
|
|
if (prs->attcur >= (prs->attmax * 2)) {
|
|
void *tmp;
|
|
prs->attmax += 12;
|
|
tmp = iks_malloc (sizeof(char *) * 2 * prs->attmax);
|
|
if (!tmp) return IKS_NOMEM;
|
|
memset (tmp, 0, sizeof(char *) * 2 * prs->attmax);
|
|
memcpy (tmp, prs->atts, sizeof(char *) * prs->attcur);
|
|
free (prs->atts);
|
|
prs->atts = tmp;
|
|
}
|
|
}
|
|
prs->attflag = 1;
|
|
prs->atts[prs->attcur] = STACK_PUSH_START;
|
|
stack_old = pos;
|
|
prs->context = C_ATTRIBUTE_1;
|
|
break;
|
|
|
|
case C_ATTRIBUTE_1:
|
|
if ('=' == c) {
|
|
if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
|
|
stack_old = -1;
|
|
STACK_PUSH_END;
|
|
prs->context = C_VALUE;
|
|
break;
|
|
}
|
|
if (stack_old == -1) stack_old = pos;
|
|
break;
|
|
|
|
case C_ATTRIBUTE_2:
|
|
if ('/' == c) {
|
|
prs->tagtype = IKS_SINGLE;
|
|
prs->atts[prs->attcur] = NULL;
|
|
prs->context = C_TAG_END;
|
|
break;
|
|
}
|
|
if ('>' == c) {
|
|
prs->atts[prs->attcur] = NULL;
|
|
prs->context = C_TAG_END;
|
|
re = 1;
|
|
break;
|
|
}
|
|
prs->context = C_ATTRIBUTE;
|
|
re = 1;
|
|
break;
|
|
|
|
case C_VALUE:
|
|
prs->atts[prs->attcur + 1] = STACK_PUSH_START;
|
|
if ('\'' == c) {
|
|
prs->context = C_VALUE_APOS;
|
|
break;
|
|
}
|
|
if ('"' == c) {
|
|
prs->context = C_VALUE_QUOT;
|
|
break;
|
|
}
|
|
return IKS_BADXML;
|
|
|
|
case C_VALUE_APOS:
|
|
if ('\'' == c) {
|
|
if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
|
|
stack_old = -1;
|
|
STACK_PUSH_END;
|
|
prs->oldcontext = C_ATTRIBUTE_2;
|
|
prs->context = C_WHITESPACE;
|
|
prs->attcur += 2;
|
|
}
|
|
if (stack_old == -1) stack_old = pos;
|
|
break;
|
|
|
|
case C_VALUE_QUOT:
|
|
if ('"' == c) {
|
|
if (stack_old != -1) STACK_PUSH (buf + stack_old, pos - stack_old);
|
|
stack_old = -1;
|
|
STACK_PUSH_END;
|
|
prs->oldcontext = C_ATTRIBUTE_2;
|
|
prs->context = C_WHITESPACE;
|
|
prs->attcur += 2;
|
|
}
|
|
if (stack_old == -1) stack_old = pos;
|
|
break;
|
|
|
|
case C_WHITESPACE:
|
|
if (NOT_WHITESPACE(c)) {
|
|
prs->context = prs->oldcontext;
|
|
re = 1;
|
|
}
|
|
break;
|
|
|
|
case C_ENTITY:
|
|
if (';' == c) {
|
|
char hede[2];
|
|
char t = '?';
|
|
prs->entity[prs->entpos] = '\0';
|
|
if (strcmp(prs->entity, "amp") == 0)
|
|
t = '&';
|
|
else if (strcmp(prs->entity, "quot") == 0)
|
|
t = '"';
|
|
else if (strcmp(prs->entity, "apos") == 0)
|
|
t = '\'';
|
|
else if (strcmp(prs->entity, "lt") == 0)
|
|
t = '<';
|
|
else if (strcmp(prs->entity, "gt") == 0)
|
|
t = '>';
|
|
old = pos + 1;
|
|
hede[0] = t;
|
|
if (prs->cdataHook) {
|
|
err = prs->cdataHook (prs->user_data, &hede[0], 1);
|
|
if (IKS_OK != err) return err;
|
|
}
|
|
prs->context = C_CDATA;
|
|
} else {
|
|
prs->entity[prs->entpos++] = buf[pos];
|
|
if (prs->entpos > 7) return IKS_BADXML;
|
|
}
|
|
break;
|
|
|
|
case C_COMMENT:
|
|
if ('-' != c) return IKS_BADXML;
|
|
prs->context = C_COMMENT_1;
|
|
break;
|
|
|
|
case C_COMMENT_1:
|
|
if ('-' == c) prs->context = C_COMMENT_2;
|
|
break;
|
|
|
|
case C_COMMENT_2:
|
|
if ('-' == c)
|
|
prs->context = C_COMMENT_3;
|
|
else
|
|
prs->context = C_COMMENT_1;
|
|
break;
|
|
|
|
case C_COMMENT_3:
|
|
if ('>' != c) return IKS_BADXML;
|
|
prs->context = C_CDATA;
|
|
old = pos + 1;
|
|
break;
|
|
|
|
case C_MARKUP:
|
|
if ('[' == c) {
|
|
prs->context = C_SECT;
|
|
break;
|
|
}
|
|
if ('-' == c) {
|
|
prs->context = C_COMMENT;
|
|
break;
|
|
}
|
|
prs->context = C_MARKUP_1;
|
|
|
|
case C_MARKUP_1:
|
|
if ('>' == c) {
|
|
old = pos + 1;
|
|
prs->context = C_CDATA;
|
|
}
|
|
break;
|
|
|
|
case C_SECT:
|
|
if ('C' == c) {
|
|
prs->context = C_SECT_CDATA;
|
|
break;
|
|
}
|
|
return IKS_BADXML;
|
|
|
|
case C_SECT_CDATA:
|
|
if ('D' != c) return IKS_BADXML;
|
|
prs->context = C_SECT_CDATA_1;
|
|
break;
|
|
|
|
case C_SECT_CDATA_1:
|
|
if ('A' != c) return IKS_BADXML;
|
|
prs->context = C_SECT_CDATA_2;
|
|
break;
|
|
|
|
case C_SECT_CDATA_2:
|
|
if ('T' != c) return IKS_BADXML;
|
|
prs->context = C_SECT_CDATA_3;
|
|
break;
|
|
|
|
case C_SECT_CDATA_3:
|
|
if ('A' != c) return IKS_BADXML;
|
|
prs->context = C_SECT_CDATA_4;
|
|
break;
|
|
|
|
case C_SECT_CDATA_4:
|
|
if ('[' != c) return IKS_BADXML;
|
|
old = pos + 1;
|
|
prs->context = C_SECT_CDATA_C;
|
|
break;
|
|
|
|
case C_SECT_CDATA_C:
|
|
if (']' == c) {
|
|
prs->context = C_SECT_CDATA_E;
|
|
if (prs->cdataHook && old < pos) {
|
|
err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
|
|
if (IKS_OK != err) return err;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case C_SECT_CDATA_E:
|
|
if (']' == c) {
|
|
prs->context = C_SECT_CDATA_E2;
|
|
} else {
|
|
if (prs->cdataHook) {
|
|
err = prs->cdataHook (prs->user_data, "]", 1);
|
|
if (IKS_OK != err) return err;
|
|
}
|
|
old = pos;
|
|
prs->context = C_SECT_CDATA_C;
|
|
}
|
|
break;
|
|
|
|
case C_SECT_CDATA_E2:
|
|
if ('>' == c) {
|
|
old = pos + 1;
|
|
prs->context = C_CDATA;
|
|
} else if (']' == c) {
|
|
/* ]]] scenario */
|
|
if (prs->cdataHook) {
|
|
err = prs->cdataHook (prs->user_data, "]", 1);
|
|
if (IKS_OK != err) return err;
|
|
}
|
|
old = pos;
|
|
} else {
|
|
if (prs->cdataHook) {
|
|
err = prs->cdataHook (prs->user_data, "]]", 2);
|
|
if (IKS_OK != err) return err;
|
|
}
|
|
old = pos;
|
|
prs->context = C_SECT_CDATA_C;
|
|
}
|
|
break;
|
|
|
|
case C_PI:
|
|
old = pos + 1;
|
|
if ('>' == c) prs->context = C_CDATA;
|
|
break;
|
|
}
|
|
cont:
|
|
if (0 == re) {
|
|
pos++;
|
|
prs->nr_bytes++;
|
|
if ('\n' == c) prs->nr_lines++;
|
|
}
|
|
}
|
|
|
|
if (stack_old != -1)
|
|
STACK_PUSH (buf + stack_old, pos - stack_old);
|
|
|
|
err = IKS_OK;
|
|
if (prs->cdataHook && (prs->context == C_CDATA || prs->context == C_SECT_CDATA_C) && old < pos)
|
|
err = prs->cdataHook (prs->user_data, &buf[old], pos - old);
|
|
return err;
|
|
}
|
|
|
|
int
|
|
iks_parse (iksparser *prs, const char *data, size_t len, int finish)
|
|
{
|
|
if (!data) return IKS_OK;
|
|
if (len == 0) len = strlen (data);
|
|
return sax_core (prs, (char *) data, len);
|
|
}
|
|
|
|
void
|
|
iks_parser_reset (iksparser *prs)
|
|
{
|
|
if (prs->deleteHook) prs->deleteHook (prs->user_data);
|
|
prs->stack_pos = 0;
|
|
prs->context = 0;
|
|
prs->oldcontext = 0;
|
|
prs->tagtype = 0;
|
|
prs->attcur = 0;
|
|
prs->attflag = 0;
|
|
prs->valflag = 0;
|
|
prs->entpos = 0;
|
|
prs->nr_bytes = 0;
|
|
prs->nr_lines = 0;
|
|
prs->uni_max = 0;
|
|
prs->uni_len = 0;
|
|
}
|
|
|
|
void
|
|
iks_parser_delete (iksparser *prs)
|
|
{
|
|
if (prs->deleteHook) prs->deleteHook (prs->user_data);
|
|
if (prs->stack) iks_free (prs->stack);
|
|
if (prs->atts) iks_free (prs->atts);
|
|
if (prs->s) iks_stack_delete (&prs->s); else iks_free (prs);
|
|
}
|