blob: 21c92b09e8abfedcf4c064f2ec50d594fb960697 [file] [log] [blame]
Lev Walkindc06f6b2004-10-20 15:50:55 +00001/*
2 * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
3 * Copyright (c) 2003, 2004 Lev Walkin <vlm@lionet.info>. All rights reserved.
4 * Redistribution and modifications are permitted subject to BSD license.
5 */
6#include <assert.h>
7#include <errno.h>
8#include <stdlib.h>
9#include <stdio.h>
10#include <sys/types.h>
11
12#include <xer_support.h>
13
/*
 * Tokenizer states. The numeric value is what pxml_parse() stores into
 * and reloads from the caller's state context between invocations.
 */
typedef enum {
	ST_TEXT,		/* Plain text (also the starting state) */
	ST_TAG_START,		/* Just seen '<' */
	ST_TAG_BODY,		/* Inside a tag, outside of any attribute value */
	ST_TAG_QUOTE_WAIT,	/* Seen '=' inside a tag, value not started yet */
	ST_TAG_QUOTED_STRING,	/* Inside a double-quoted attribute value */
	ST_TAG_UNQUOTED_STRING,	/* Inside an unquoted attribute value */
	ST_COMMENT_WAIT_DASH1,	/* Seen "<!", expecting the first '-' */
	ST_COMMENT_WAIT_DASH2,	/* Seen "<!-", expecting the second '-' */
	ST_COMMENT,		/* Inside a comment body */
	ST_COMMENT_CLO_DASH2,	/* Seen '-' in a comment, expecting another */
	ST_COMMENT_CLO_RT	/* Seen "--" in a comment, expecting '>' */
} pstate_e;
28
29static pxml_chunk_type_e final_chunk_type[] = {
30 PXML_TEXT,
31 PXML_TAG_END,
32 PXML_COMMENT_END,
33 PXML_TAG_END,
34 PXML_COMMENT_END,
35};
36
37
/*
 * Character classification table, indexed by byte value:
 *   0 - anything else
 *   1 - whitespace (TAB, LF, FF, CR, SPACE)
 *   2 - decimal digit
 *   3 - ASCII letter
 * Only the first 128 entries are spelled out; bytes 0x80..0xff are
 * implicitly zero-initialized and therefore belong to class 0.
 * The table is read-only, and its name avoids the leading underscore
 * which is reserved for the implementation at file scope.
 */
static const int
charclass[256] = {
	0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,	/* TAB LF . FF CR */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* SPACE */
	2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0,	/* 01234567 89 */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  ABCDEFG HIJKLMNO */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0,	/* PQRSTUVW XYZ */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  abcdefg hijklmno */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0	/* pqrstuvw xyz */
};

/*
 * Classification predicates. The argument is converted to unsigned char
 * first, so a negative (signed) char cannot index before the table.
 */
#define	WHITESPACE(c)	(charclass[(unsigned char)(c)] == 1)
#define	ALNUM(c)	(charclass[(unsigned char)(c)] >= 2)
#define	ALPHA(c)	(charclass[(unsigned char)(c)] == 3)
52
/*
 * Symbolic names for the punctuation the tokenizer reacts to.
 * The values are the ASCII (and hence UTF-8) codes of the characters.
 */
enum {
	EXCLAM	= 0x21,	/* '!' */
	CQUOTE	= 0x22,	/* '"' */
	CDASH	= 0x2d,	/* '-' */
	CSLASH	= 0x2f,	/* '/' */
	LANGLE	= 0x3c,	/* '<' */
	CEQUAL	= 0x3d,	/* '=' */
	RANGLE	= 0x3e,	/* '>' */
	CQUEST	= 0x3f	/* '?' */
};
Lev Walkindc06f6b2004-10-20 15:50:55 +000062
/*
 * Invoke the token callback for the chunk collected so far.
 *
 * This macro is only meaningful inside pxml_parse(): it reads and updates
 * that function's locals `p`, `chunk_start` and `state`, calls its `cb`
 * with `key`, and may jump to its `finish` label.
 *
 *   type          chunk type handed to the callback
 *   _ns           state to switch to afterwards
 *   _current_too  non-zero if the byte at `p` belongs to the chunk
 *   _final        not used by the expansion; kept so that existing
 *                 invocations stay valid
 *
 * An empty chunk is not reported; only the state changes.
 * If the callback returns less than the chunk size, parsing stops right
 * away and the chunk is left unconsumed. In that case the state still
 * advances to _ns, but only when the chunk included the current byte and
 * the callback returned exactly -1.
 *
 * Every argument is parenthesized and _ns is evaluated exactly once.
 */
#define	TOKEN_CB_CALL(type, _ns, _current_too, _final)	do {		\
		int _ret;						\
		pstate_e ns = (_ns);					\
		ssize_t _sz = (p - chunk_start) + (_current_too);	\
		if(!_sz) {						\
			/* Shortcut: nothing to report */		\
			state = ns;					\
			break;						\
		}							\
		_ret = cb((type), chunk_start, _sz, key);		\
		if(_ret < _sz) {					\
			if((_current_too) && _ret == -1)		\
				state = ns;				\
			goto finish;					\
		}							\
		chunk_start = p + (_current_too);			\
		state = ns;						\
	} while(0)
82
/*
 * Report the pending chunk under the given chunk type, then switch
 * to next_state. with_current tells whether the byte currently being
 * examined is part of the chunk.
 */
#define	TOKEN_CB(chunk_kind, next_state, with_current)		\
	TOKEN_CB_CALL(chunk_kind, next_state, with_current, 0)

/*
 * Same as TOKEN_CB(), except that the chunk type actually reported is
 * looked up in final_chunk_type[] using chunk_kind as the index.
 */
#define	TOKEN_CB_FINAL(chunk_kind, next_state, with_current)	\
	TOKEN_CB_CALL(final_chunk_type[chunk_kind], next_state, with_current, 1)
88
89/*
90 * Parser itself
91 */
Lev Walkin0fab1a62005-03-09 22:19:25 +000092ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
Lev Walkindc06f6b2004-10-20 15:50:55 +000093 pstate_e state = (pstate_e)*stateContext;
Lev Walkin0fab1a62005-03-09 22:19:25 +000094 const char *chunk_start = (const char *)xmlbuf;
95 const char *p = chunk_start;
96 const char *end = p + size;
Lev Walkindc06f6b2004-10-20 15:50:55 +000097
98 for(; p < end; p++) {
Lev Walkin0fab1a62005-03-09 22:19:25 +000099 int C = *(const unsigned char *)p;
Lev Walkindc06f6b2004-10-20 15:50:55 +0000100 switch(state) {
101 case ST_TEXT:
102 /*
103 * Initial state: we're in the middle of some text,
104 * or just have started.
105 */
106 if (C == LANGLE)
107 /* We're now in the tag, probably */
108 TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
109 break;
110 case ST_TAG_START:
111 if (ALPHA(C) || (C == CSLASH))
112 state = ST_TAG_BODY;
113 else if (C == EXCLAM)
114 state = ST_COMMENT_WAIT_DASH1;
115 else
116 /*
117 * Not characters and not whitespace.
118 * Must be something like "3 < 4".
119 */
120 TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
121 break;
122 case ST_TAG_BODY:
123 switch(C) {
124 case RANGLE:
125 /* End of the tag */
126 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
127 break;
128 case LANGLE:
129 /*
130 * The previous tag wasn't completed, but still
131 * recognized as valid. (Mozilla-compatible)
132 */
133 TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
134 break;
135 case CEQUAL:
136 state = ST_TAG_QUOTE_WAIT;
137 break;
138 }
139 break;
140 case ST_TAG_QUOTE_WAIT:
141 /*
142 * State after the equal sign ("=") in the tag.
143 */
144 switch(C) {
145 case CQUOTE:
146 state = ST_TAG_QUOTED_STRING;
147 break;
148 case RANGLE:
149 /* End of the tag */
150 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
151 break;
152 default:
153 if(!WHITESPACE(C))
154 /* Unquoted string value */
155 state = ST_TAG_UNQUOTED_STRING;
156 }
157 break;
158 case ST_TAG_QUOTED_STRING:
159 /*
160 * Tag attribute's string value in quotes.
161 */
162 if(C == CQUOTE) {
163 /* Return back to the tag state */
164 state = ST_TAG_BODY;
165 }
166 break;
167 case ST_TAG_UNQUOTED_STRING:
168 if(C == RANGLE) {
169 /* End of the tag */
170 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
171 } else if(WHITESPACE(C)) {
172 /* Return back to the tag state */
173 state = ST_TAG_BODY;
174 }
175 break;
176 case ST_COMMENT_WAIT_DASH1:
177 if(C == CDASH) {
178 state = ST_COMMENT_WAIT_DASH2;
179 } else {
180 /* Some ordinary tag. */
181 state = ST_TAG_BODY;
182 }
183 break;
184 case ST_COMMENT_WAIT_DASH2:
185 if(C == CDASH) {
186 /* Seen "<--" */
187 state = ST_COMMENT;
188 } else {
189 /* Some ordinary tag */
190 state = ST_TAG_BODY;
191 }
192 break;
193 case ST_COMMENT:
194 if(C == CDASH) {
195 state = ST_COMMENT_CLO_DASH2;
196 }
197 break;
198 case ST_COMMENT_CLO_DASH2:
199 if(C == CDASH) {
200 state = ST_COMMENT_CLO_RT;
201 } else {
202 /* This is not an end of a comment */
203 state = ST_COMMENT;
204 }
205 break;
206 case ST_COMMENT_CLO_RT:
207 if(C == RANGLE) {
208 TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
209 } else {
210 state = ST_COMMENT;
211 }
212 break;
213 } /* switch(*ptr) */
214 } /* for() */
215
216 /*
217 * Flush the partially processed chunk, state permitting.
218 */
219 if(p - chunk_start) {
220 switch (state) {
221 case ST_COMMENT:
222 TOKEN_CB(PXML_COMMENT, state, 0);
223 break;
224 case ST_TEXT:
225 TOKEN_CB(PXML_TEXT, state, 0);
226 break;
227 default: break; /* a no-op */
228 }
229 }
230
231finish:
232 *stateContext = (int)state;
Lev Walkin0fab1a62005-03-09 22:19:25 +0000233 return chunk_start - (const char *)xmlbuf;
Lev Walkindc06f6b2004-10-20 15:50:55 +0000234}
235