/*
 * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
 * Copyright (c) 2003, 2004, 2005 Lev Walkin <vlm@lionet.info>.
 * 	All rights reserved.
 * Redistribution and modifications are permitted subject to BSD license.
 */
#include <asn_system.h>
#include <xer_support.h>

/*
 * Parser states.
 *
 * The current state is carried between pxml_parse() calls as a plain int
 * (*stateContext), so the numeric values of these enumerators must not
 * change: ST_TEXT has to remain the first (zero) entry.
 */
typedef enum {
	ST_TEXT,		/* Plain text outside of any tag */
	ST_TAG_START,		/* Just seen '<' */
	ST_TAG_BODY,		/* Inside a tag, outside of attribute values */
	ST_TAG_QUOTE_WAIT,	/* Seen '=' inside a tag, value not started */
	ST_TAG_QUOTED_STRING,	/* Inside a "quoted" attribute value */
	ST_TAG_UNQUOTED_STRING,	/* Inside an unquoted attribute value */
	ST_COMMENT_WAIT_DASH1,	/* Seen "<!", i.e. "<!--"[1] */
	ST_COMMENT_WAIT_DASH2,	/* Seen "<!-", i.e. "<!--"[2] */
	ST_COMMENT,		/* Inside the comment body */
	ST_COMMENT_CLO_DASH2,	/* Seen '-' in a comment, i.e. "-->"[0] */
	ST_COMMENT_CLO_RT	/* Seen "--" in a comment, i.e. "-->"[1] */
} pstate_e;

25static pxml_chunk_type_e final_chunk_type[] = {
26 PXML_TEXT,
27 PXML_TAG_END,
28 PXML_COMMENT_END,
29 PXML_TAG_END,
30 PXML_COMMENT_END,
31};
32
33
/*
 * Character classification table, indexed by an unsigned byte value:
 *	0: anything else;
 *	1: whitespace (HT, LF, FF, CR, SPACE);
 *	2: decimal digit;
 *	3: ASCII letter.
 * Only the first 128 entries are spelled out; entries 0x80..0xFF are
 * implicitly zero-initialized and therefore fall into class 0.
 */
static const int
_charclass[256] = {
	0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0,	/* 01234567 89       */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  ABCDEFG HIJKLMNO */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0,	/* PQRSTUVW XYZ      */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  abcdefg hijklmno */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0	/* pqrstuvw xyz      */
};
/* The cast keeps the index within 0..255 even for negative char values. */
#define WHITESPACE(c)	(_charclass[(unsigned char)(c)] == 1)
#define ALNUM(c)	(_charclass[(unsigned char)(c)] >= 2)
#define ALPHA(c)	(_charclass[(unsigned char)(c)] == 3)

/* Aliases for characters, ASCII/UTF-8 */
#define	EXCLAM	0x21	/* '!' */
#define	CQUOTE	0x22	/* '"' */
#define	CDASH	0x2d	/* '-' */
#define	CSLASH	0x2f	/* '/' */
#define	LANGLE	0x3c	/* '<' */
#define	CEQUAL	0x3d	/* '=' */
#define	RANGLE	0x3e	/* '>' */
#define	CQUEST	0x3f	/* '?' */

/*
 * Invoke token callback.
 *
 * This macro is not self-contained: it expands inside pxml_parse() and uses
 * that function's `p', `chunk_start', `state', `cb' and `key' variables as
 * well as its `finish' label.
 *
 *	type:		chunk type passed to the callback;
 *	_ns:		state to switch to once the chunk is consumed;
 *	_current_too:	1 if the byte at `p' belongs to the chunk, 0 otherwise;
 *	_final:		not used by the expansion.
 *
 * An empty chunk is never reported; only the state is changed.
 * If the callback returns less than the chunk size, control is transferred
 * to `finish' without advancing `chunk_start'. In that case the state is
 * switched to _ns only when _current_too is set and the callback returned -1.
 */
#define	TOKEN_CB_CALL(type, _ns, _current_too, _final)	do {	\
		int _ret;					\
		pstate_e ns = (_ns);				\
		ssize_t _sz = (p - chunk_start) + (_current_too); \
		(void)(_final);					\
		if(!_sz) {					\
			/* Shortcut: nothing to report */	\
			state = ns;				\
			break;					\
		}						\
		_ret = cb((type), chunk_start, _sz, key);	\
		if(_ret < _sz) {				\
			if((_current_too) && _ret == -1)	\
				state = ns;			\
			goto finish;				\
		}						\
		chunk_start = p + (_current_too);		\
		state = ns;					\
	} while(0)

/* Report a chunk of the given type as is (not as an element terminator). */
#define	TOKEN_CB(_type, _ns, _current_too)			\
	TOKEN_CB_CALL(_type, _ns, _current_too, 0)

/*
 * Report a chunk which completes its element: the given type is translated
 * through the final_chunk_type[] table before the callback is invoked.
 */
#define	TOKEN_CB_FINAL(_type, _ns, _current_too)		\
	TOKEN_CB_CALL(final_chunk_type[_type], _ns, _current_too, 1)

85/*
86 * Parser itself
87 */
Lev Walkin0fab1a62005-03-09 22:19:25 +000088ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
Lev Walkindc06f6b2004-10-20 15:50:55 +000089 pstate_e state = (pstate_e)*stateContext;
Lev Walkin0fab1a62005-03-09 22:19:25 +000090 const char *chunk_start = (const char *)xmlbuf;
91 const char *p = chunk_start;
92 const char *end = p + size;
Lev Walkindc06f6b2004-10-20 15:50:55 +000093
94 for(; p < end; p++) {
Lev Walkin0fab1a62005-03-09 22:19:25 +000095 int C = *(const unsigned char *)p;
Lev Walkindc06f6b2004-10-20 15:50:55 +000096 switch(state) {
97 case ST_TEXT:
98 /*
99 * Initial state: we're in the middle of some text,
100 * or just have started.
101 */
102 if (C == LANGLE)
103 /* We're now in the tag, probably */
104 TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
105 break;
106 case ST_TAG_START:
107 if (ALPHA(C) || (C == CSLASH))
108 state = ST_TAG_BODY;
109 else if (C == EXCLAM)
110 state = ST_COMMENT_WAIT_DASH1;
111 else
112 /*
113 * Not characters and not whitespace.
114 * Must be something like "3 < 4".
115 */
116 TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
117 break;
118 case ST_TAG_BODY:
119 switch(C) {
120 case RANGLE:
121 /* End of the tag */
122 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
123 break;
124 case LANGLE:
125 /*
126 * The previous tag wasn't completed, but still
127 * recognized as valid. (Mozilla-compatible)
128 */
129 TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
130 break;
131 case CEQUAL:
132 state = ST_TAG_QUOTE_WAIT;
133 break;
134 }
135 break;
136 case ST_TAG_QUOTE_WAIT:
137 /*
138 * State after the equal sign ("=") in the tag.
139 */
140 switch(C) {
141 case CQUOTE:
142 state = ST_TAG_QUOTED_STRING;
143 break;
144 case RANGLE:
145 /* End of the tag */
146 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
147 break;
148 default:
149 if(!WHITESPACE(C))
150 /* Unquoted string value */
151 state = ST_TAG_UNQUOTED_STRING;
152 }
153 break;
154 case ST_TAG_QUOTED_STRING:
155 /*
156 * Tag attribute's string value in quotes.
157 */
158 if(C == CQUOTE) {
159 /* Return back to the tag state */
160 state = ST_TAG_BODY;
161 }
162 break;
163 case ST_TAG_UNQUOTED_STRING:
164 if(C == RANGLE) {
165 /* End of the tag */
166 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
167 } else if(WHITESPACE(C)) {
168 /* Return back to the tag state */
169 state = ST_TAG_BODY;
170 }
171 break;
172 case ST_COMMENT_WAIT_DASH1:
173 if(C == CDASH) {
174 state = ST_COMMENT_WAIT_DASH2;
175 } else {
176 /* Some ordinary tag. */
177 state = ST_TAG_BODY;
178 }
179 break;
180 case ST_COMMENT_WAIT_DASH2:
181 if(C == CDASH) {
182 /* Seen "<--" */
183 state = ST_COMMENT;
184 } else {
185 /* Some ordinary tag */
186 state = ST_TAG_BODY;
187 }
188 break;
189 case ST_COMMENT:
190 if(C == CDASH) {
191 state = ST_COMMENT_CLO_DASH2;
192 }
193 break;
194 case ST_COMMENT_CLO_DASH2:
195 if(C == CDASH) {
196 state = ST_COMMENT_CLO_RT;
197 } else {
198 /* This is not an end of a comment */
199 state = ST_COMMENT;
200 }
201 break;
202 case ST_COMMENT_CLO_RT:
203 if(C == RANGLE) {
204 TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
Lev Walkin18797bc2005-05-26 19:25:53 +0000205 } else if(C == CDASH) {
206 /* Maintain current state, still waiting for '>' */
Lev Walkindc06f6b2004-10-20 15:50:55 +0000207 } else {
208 state = ST_COMMENT;
209 }
210 break;
211 } /* switch(*ptr) */
212 } /* for() */
213
214 /*
215 * Flush the partially processed chunk, state permitting.
216 */
217 if(p - chunk_start) {
218 switch (state) {
219 case ST_COMMENT:
220 TOKEN_CB(PXML_COMMENT, state, 0);
221 break;
222 case ST_TEXT:
223 TOKEN_CB(PXML_TEXT, state, 0);
224 break;
225 default: break; /* a no-op */
226 }
227 }
228
229finish:
230 *stateContext = (int)state;
Lev Walkin0fab1a62005-03-09 22:19:25 +0000231 return chunk_start - (const char *)xmlbuf;
Lev Walkindc06f6b2004-10-20 15:50:55 +0000232}
233