blob: b2f542cdc6d319803891217e61f0360d62eed71f [file] [log] [blame]
vlm9de248e2004-10-20 15:50:55 +00001/*
2 * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
3 * Copyright (c) 2003, 2004 Lev Walkin <vlm@lionet.info>. All rights reserved.
4 * Redistribution and modifications are permitted subject to BSD license.
5 */
6#include <assert.h>
7#include <errno.h>
8#include <stdlib.h>
9#include <stdio.h>
10#include <sys/types.h>
11
12#include <xer_support.h>
13
/*
 * Tokenizer states.
 * The numeric value of the current state is what pxml_parse() loads from
 * and stores back into the caller-provided state context.
 */
typedef enum {
	ST_TEXT			= 0,	/* Plain text (also the starting state) */
	ST_TAG_START		= 1,	/* Just seen '<' */
	ST_TAG_BODY		= 2,	/* Inside a tag */
	ST_TAG_QUOTE_WAIT	= 3,	/* Seen '=' inside a tag */
	ST_TAG_QUOTED_STRING	= 4,	/* Inside a double-quoted value */
	ST_TAG_UNQUOTED_STRING	= 5,	/* Inside an unquoted value */
	ST_COMMENT_WAIT_DASH1	= 6,	/* "<!--"[1] */
	ST_COMMENT_WAIT_DASH2	= 7,	/* "<!--"[2] */
	ST_COMMENT		= 8,	/* Inside a comment */
	ST_COMMENT_CLO_DASH2	= 9,	/* "-->"[0] */
	ST_COMMENT_CLO_RT	= 10	/* "-->"[1] */
} pstate_e;
28
29static pxml_chunk_type_e final_chunk_type[] = {
30 PXML_TEXT,
31 PXML_TAG_END,
32 PXML_COMMENT_END,
33 PXML_TAG_END,
34 PXML_COMMENT_END,
35};
36
37
/*
 * Character classification table, indexed by byte value.
 * Class codes:
 *	0 - everything else
 *	1 - whitespace: TAB, LF, FF, CR and SPACE (VT is NOT included)
 *	2 - decimal digit
 *	3 - ASCII letter
 * Only the first 128 entries are spelled out; the remaining ones
 * (bytes 0x80..0xFF) are implicitly zero-initialized, i.e. class 0.
 * The table is read-only, hence const.
 */
static const int
_charclass[256] = {
	0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0,	/* 01234567 89       */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  ABCDEFG HIJKLMNO */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0,	/* PQRSTUVW XYZ      */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  abcdefg hijklmno */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0	/* pqrstuvw xyz      */
};
/*
 * The argument is converted to unsigned char before indexing, so passing
 * a (possibly negative) plain char is safe.
 */
#define WHITESPACE(c)	(_charclass[(unsigned char)(c)] == 1)
#define ALNUM(c)	(_charclass[(unsigned char)(c)] >= 2)
#define ALPHA(c)	(_charclass[(unsigned char)(c)] == 3)
52
/*
 * Symbolic names for the markup characters the tokenizer reacts to.
 * Spelled as ASCII/UTF-8 code points rather than character literals.
 */
enum {
	EXCLAM	= 0x21,	/* '!' */
	CQUOTE	= 0x22,	/* '"' */
	CDASH	= 0x2d,	/* '-' */
	CSLASH	= 0x2f,	/* '/' */
	LANGLE	= 0x3c,	/* '<' */
	CEQUAL	= 0x3d,	/* '=' */
	RANGLE	= 0x3e	/* '>' */
};
61
/*
 * Report the pending chunk to the token callback.
 *
 * These macros are meant to be expanded inside pxml_parse() only: they use
 * its locals (p, chunk_start, state, cb, key) and its "finish" label.
 *
 *	type		chunk type handed to the callback;
 *	_ns		state to switch to once the chunk is accepted;
 *	_current_too	1 if the byte at *p belongs to the chunk, 0 otherwise;
 *	_final		currently unused.
 *
 * The chunk is [chunk_start, p), plus the current byte if requested.
 * An empty chunk is not reported; only the state is switched.
 * If the callback returns less than the chunk size, control jumps to
 * "finish" without advancing chunk_start. In that case the state is
 * switched only when the current byte was included and the callback
 * returned exactly -1.
 */
#define	TOKEN_CB_CALL(type, _ns, _current_too, _final)	do {		\
		pstate_e _next_state = (_ns);				\
		ssize_t _chunk_len = (p - chunk_start)			\
					+ (_current_too);		\
		int _cb_ret;						\
		if(_chunk_len == 0) {					\
			/* Shortcut: nothing to report */		\
			state = _next_state;				\
			break;						\
		}							\
		_cb_ret = cb((type), chunk_start, _chunk_len, key);	\
		if(_cb_ret < _chunk_len) {				\
			if((_current_too) && _cb_ret == -1)		\
				state = _next_state;			\
			goto finish;					\
		}							\
		state = _next_state;					\
		chunk_start = p + (_current_too);			\
	} while(0)

/* Report a chunk which does not terminate its XML element. */
#define	TOKEN_CB(_type, _ns, _current_too)			\
	TOKEN_CB_CALL((_type), (_ns), (_current_too), 0)

/*
 * Report a chunk which terminates its XML element: the chunk type is
 * first translated through the final_chunk_type[] table.
 */
#define	TOKEN_CB_FINAL(_type, _ns, _current_too)		\
	TOKEN_CB_CALL(final_chunk_type[(_type)], (_ns), (_current_too), 1)
87
88/*
89 * Parser itself
90 */
vlm6c593842004-10-26 09:03:31 +000091ssize_t pxml_parse(int *stateContext, void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
vlm9de248e2004-10-20 15:50:55 +000092 pstate_e state = (pstate_e)*stateContext;
93 char *chunk_start = (char *)xmlbuf;
94 char *p = chunk_start;
95 char *end = p + size;
96
97 for(; p < end; p++) {
98 int C = *(unsigned char *)p;
99 switch(state) {
100 case ST_TEXT:
101 /*
102 * Initial state: we're in the middle of some text,
103 * or just have started.
104 */
105 if (C == LANGLE)
106 /* We're now in the tag, probably */
107 TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
108 break;
109 case ST_TAG_START:
110 if (ALPHA(C) || (C == CSLASH))
111 state = ST_TAG_BODY;
112 else if (C == EXCLAM)
113 state = ST_COMMENT_WAIT_DASH1;
114 else
115 /*
116 * Not characters and not whitespace.
117 * Must be something like "3 < 4".
118 */
119 TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
120 break;
121 case ST_TAG_BODY:
122 switch(C) {
123 case RANGLE:
124 /* End of the tag */
125 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
126 break;
127 case LANGLE:
128 /*
129 * The previous tag wasn't completed, but still
130 * recognized as valid. (Mozilla-compatible)
131 */
132 TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
133 break;
134 case CEQUAL:
135 state = ST_TAG_QUOTE_WAIT;
136 break;
137 }
138 break;
139 case ST_TAG_QUOTE_WAIT:
140 /*
141 * State after the equal sign ("=") in the tag.
142 */
143 switch(C) {
144 case CQUOTE:
145 state = ST_TAG_QUOTED_STRING;
146 break;
147 case RANGLE:
148 /* End of the tag */
149 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
150 break;
151 default:
152 if(!WHITESPACE(C))
153 /* Unquoted string value */
154 state = ST_TAG_UNQUOTED_STRING;
155 }
156 break;
157 case ST_TAG_QUOTED_STRING:
158 /*
159 * Tag attribute's string value in quotes.
160 */
161 if(C == CQUOTE) {
162 /* Return back to the tag state */
163 state = ST_TAG_BODY;
164 }
165 break;
166 case ST_TAG_UNQUOTED_STRING:
167 if(C == RANGLE) {
168 /* End of the tag */
169 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
170 } else if(WHITESPACE(C)) {
171 /* Return back to the tag state */
172 state = ST_TAG_BODY;
173 }
174 break;
175 case ST_COMMENT_WAIT_DASH1:
176 if(C == CDASH) {
177 state = ST_COMMENT_WAIT_DASH2;
178 } else {
179 /* Some ordinary tag. */
180 state = ST_TAG_BODY;
181 }
182 break;
183 case ST_COMMENT_WAIT_DASH2:
184 if(C == CDASH) {
185 /* Seen "<--" */
186 state = ST_COMMENT;
187 } else {
188 /* Some ordinary tag */
189 state = ST_TAG_BODY;
190 }
191 break;
192 case ST_COMMENT:
193 if(C == CDASH) {
194 state = ST_COMMENT_CLO_DASH2;
195 }
196 break;
197 case ST_COMMENT_CLO_DASH2:
198 if(C == CDASH) {
199 state = ST_COMMENT_CLO_RT;
200 } else {
201 /* This is not an end of a comment */
202 state = ST_COMMENT;
203 }
204 break;
205 case ST_COMMENT_CLO_RT:
206 if(C == RANGLE) {
207 TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
208 } else {
209 state = ST_COMMENT;
210 }
211 break;
212 } /* switch(*ptr) */
213 } /* for() */
214
215 /*
216 * Flush the partially processed chunk, state permitting.
217 */
218 if(p - chunk_start) {
219 switch (state) {
220 case ST_COMMENT:
221 TOKEN_CB(PXML_COMMENT, state, 0);
222 break;
223 case ST_TEXT:
224 TOKEN_CB(PXML_TEXT, state, 0);
225 break;
226 default: break; /* a no-op */
227 }
228 }
229
230finish:
231 *stateContext = (int)state;
232 return chunk_start - (char *)xmlbuf;
233}
234