blob: 36b4bfbfc07e53f10cef2941c01b9378250bab93 [file] [log] [blame]
/*
 * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
 * Copyright (c) 2003, 2004, 2005 Lev Walkin <vlm@lionet.info>.
 * All rights reserved.
 * Redistribution and modifications are permitted subject to BSD license.
 */
Lev Walkin59b176e2005-11-26 11:25:14 +00007#include <asn_system.h>
Lev Walkindc06f6b2004-10-20 15:50:55 +00008#include <xer_support.h>
9
/*
 * States of the XML tokenizer.
 * pxml_parse() loads the current state from, and stores it back into,
 * the caller-supplied integer state context, so the numeric values of
 * these enumerators must stay stable.
 */
typedef enum {
    ST_TEXT = 0,            /* Plain text, outside of any markup */
    ST_TAG_START,           /* Just seen '<' */
    ST_TAG_BODY,            /* Inside a tag, outside of an attribute value */
    ST_TAG_QUOTE_WAIT,      /* Seen '=' inside a tag, value not started yet */
    ST_TAG_QUOTED_STRING,   /* Inside a double-quoted attribute value */
    ST_TAG_UNQUOTED_STRING, /* Inside an unquoted attribute value */
    ST_COMMENT_WAIT_DASH1,  /* Seen "<!", waiting for "<!--"[2] */
    ST_COMMENT_WAIT_DASH2,  /* Seen "<!-", waiting for "<!--"[3] */
    ST_COMMENT,             /* Inside the comment body */
    ST_COMMENT_CLO_DASH2,   /* Seen "-->"[0], waiting for "-->"[1] */
    ST_COMMENT_CLO_RT       /* Seen "--", waiting for "-->"[2] */
} pstate_e;
24
/*
 * Character classification table, indexed by the unsigned byte value:
 *   0 - any other byte (this includes every byte >= 0x80),
 *   1 - whitespace: TAB, LF, FF, CR and SPACE (VT is NOT whitespace here),
 *   2 - decimal digit '0'..'9',
 *   3 - ASCII letter 'A'..'Z' or 'a'..'z'.
 * Only the first 128 entries are spelled out; the remaining ones are
 * implicitly zero-initialized.
 */
static const unsigned char
pxml_charclass[256] = {
    0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0,   /* 01234567 89       */
    0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,   /*  ABCDEFG HIJKLMNO */
    3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0,   /* PQRSTUVW XYZ      */
    0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,   /*  abcdefg hijklmno */
    3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0    /* pqrstuvw xyz      */
};

/*
 * Classification predicates. The argument is converted to unsigned char
 * before indexing, so a (possibly negative) plain char is safe to pass.
 */
#define WHITESPACE(c)   (pxml_charclass[(unsigned char)(c)] == 1)
#define ALNUM(c)        (pxml_charclass[(unsigned char)(c)] >= 2)
#define ALPHA(c)        (pxml_charclass[(unsigned char)(c)] == 3)
39
/*
 * Numeric codes (ASCII/UTF-8) of the characters the parser reacts to.
 */
enum {
    EXCLAM = 0x21,  /* '!' */
    CQUOTE = 0x22,  /* '"' */
    CDASH  = 0x2d,  /* '-' */
    CSLASH = 0x2f,  /* '/' */
    LANGLE = 0x3c,  /* '<' */
    CEQUAL = 0x3d,  /* '=' */
    RANGLE = 0x3e,  /* '>' */
    CQUEST = 0x3f   /* '?' */
};
Lev Walkindc06f6b2004-10-20 15:50:55 +000049
/*
 * Invoke the token callback on the chunk [chunk_start, p) accumulated so far.
 *
 * The expansion relies on these names being in scope at the point of use:
 * p, chunk_start, state, cb, key, and a label named "finish".
 *
 *   type          chunk type passed to the callback as its first argument;
 *   _ns           parser state to switch to (evaluated exactly once);
 *   _current_too  1 to make the current character (*p) part of the chunk,
 *                 0 to end the chunk right before it;
 *   _final        not referenced by the expansion; kept so that existing
 *                 invocations keep their shape.
 *
 * Behavior:
 *  - empty chunk: only the state is switched, the callback is not invoked;
 *  - callback returned less than the chunk size: control jumps to "finish"
 *    with chunk_start left in place; the state is switched only if the
 *    current character was included and the callback returned exactly -1;
 *  - otherwise chunk_start moves past the chunk and the state is switched.
 */
#define TOKEN_CB_CALL(type, _ns, _current_too, _final) do {         \
        int _ret;                                                   \
        pstate_e ns = (_ns);                                        \
        ssize_t _sz = (p - chunk_start) + (_current_too);           \
        if(!_sz) {                                                  \
            /* Shortcut: nothing to report */                       \
            state = ns;                                             \
            break;                                                  \
        }                                                           \
        _ret = cb((type), chunk_start, _sz, key);                   \
        if(_ret < _sz) {                                            \
            if((_current_too) && _ret == -1)                        \
                state = ns;                                         \
            goto finish;                                            \
        }                                                           \
        chunk_start = p + (_current_too);                           \
        state = ns;                                                 \
    } while(0)

/* Report an intermediate (non-final) chunk of the given type. */
#define TOKEN_CB(_type, _ns, _current_too)                          \
    TOKEN_CB_CALL(_type, _ns, _current_too, 0)

/* Chunk types reported when a tag or a comment has been seen to its end. */
#define PXML_TAG_FINAL_CHUNK_TYPE       PXML_TAG_END
#define PXML_COMMENT_FINAL_CHUNK_TYPE   PXML_COMMENT_END

/*
 * Report the closing chunk of a tag or a comment: _type (PXML_TAG or
 * PXML_COMMENT) is mapped to its *_FINAL_CHUNK_TYPE counterpart above.
 */
#define TOKEN_CB_FINAL(_type, _ns, _current_too)                    \
    TOKEN_CB_CALL( _type ## _FINAL_CHUNK_TYPE , _ns, _current_too, 1)
Lev Walkindc06f6b2004-10-20 15:50:55 +000078
79/*
80 * Parser itself
81 */
Lev Walkin0fab1a62005-03-09 22:19:25 +000082ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
Lev Walkindc06f6b2004-10-20 15:50:55 +000083 pstate_e state = (pstate_e)*stateContext;
Lev Walkin0fab1a62005-03-09 22:19:25 +000084 const char *chunk_start = (const char *)xmlbuf;
85 const char *p = chunk_start;
86 const char *end = p + size;
Lev Walkindc06f6b2004-10-20 15:50:55 +000087
88 for(; p < end; p++) {
Lev Walkin0fab1a62005-03-09 22:19:25 +000089 int C = *(const unsigned char *)p;
Lev Walkindc06f6b2004-10-20 15:50:55 +000090 switch(state) {
91 case ST_TEXT:
92 /*
93 * Initial state: we're in the middle of some text,
94 * or just have started.
95 */
96 if (C == LANGLE)
97 /* We're now in the tag, probably */
98 TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
99 break;
100 case ST_TAG_START:
101 if (ALPHA(C) || (C == CSLASH))
102 state = ST_TAG_BODY;
103 else if (C == EXCLAM)
104 state = ST_COMMENT_WAIT_DASH1;
105 else
106 /*
107 * Not characters and not whitespace.
108 * Must be something like "3 < 4".
109 */
110 TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
111 break;
112 case ST_TAG_BODY:
113 switch(C) {
114 case RANGLE:
115 /* End of the tag */
116 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
117 break;
118 case LANGLE:
119 /*
120 * The previous tag wasn't completed, but still
121 * recognized as valid. (Mozilla-compatible)
122 */
123 TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
124 break;
125 case CEQUAL:
126 state = ST_TAG_QUOTE_WAIT;
127 break;
128 }
129 break;
130 case ST_TAG_QUOTE_WAIT:
131 /*
132 * State after the equal sign ("=") in the tag.
133 */
134 switch(C) {
135 case CQUOTE:
136 state = ST_TAG_QUOTED_STRING;
137 break;
138 case RANGLE:
139 /* End of the tag */
140 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
141 break;
142 default:
143 if(!WHITESPACE(C))
144 /* Unquoted string value */
145 state = ST_TAG_UNQUOTED_STRING;
146 }
147 break;
148 case ST_TAG_QUOTED_STRING:
149 /*
150 * Tag attribute's string value in quotes.
151 */
152 if(C == CQUOTE) {
153 /* Return back to the tag state */
154 state = ST_TAG_BODY;
155 }
156 break;
157 case ST_TAG_UNQUOTED_STRING:
158 if(C == RANGLE) {
159 /* End of the tag */
160 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
161 } else if(WHITESPACE(C)) {
162 /* Return back to the tag state */
163 state = ST_TAG_BODY;
164 }
165 break;
166 case ST_COMMENT_WAIT_DASH1:
167 if(C == CDASH) {
168 state = ST_COMMENT_WAIT_DASH2;
169 } else {
170 /* Some ordinary tag. */
171 state = ST_TAG_BODY;
172 }
173 break;
174 case ST_COMMENT_WAIT_DASH2:
175 if(C == CDASH) {
176 /* Seen "<--" */
177 state = ST_COMMENT;
178 } else {
179 /* Some ordinary tag */
180 state = ST_TAG_BODY;
181 }
182 break;
183 case ST_COMMENT:
184 if(C == CDASH) {
185 state = ST_COMMENT_CLO_DASH2;
186 }
187 break;
188 case ST_COMMENT_CLO_DASH2:
189 if(C == CDASH) {
190 state = ST_COMMENT_CLO_RT;
191 } else {
192 /* This is not an end of a comment */
193 state = ST_COMMENT;
194 }
195 break;
196 case ST_COMMENT_CLO_RT:
197 if(C == RANGLE) {
198 TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
Lev Walkin18797bc2005-05-26 19:25:53 +0000199 } else if(C == CDASH) {
200 /* Maintain current state, still waiting for '>' */
Lev Walkindc06f6b2004-10-20 15:50:55 +0000201 } else {
202 state = ST_COMMENT;
203 }
204 break;
205 } /* switch(*ptr) */
206 } /* for() */
207
208 /*
209 * Flush the partially processed chunk, state permitting.
210 */
211 if(p - chunk_start) {
212 switch (state) {
213 case ST_COMMENT:
214 TOKEN_CB(PXML_COMMENT, state, 0);
215 break;
216 case ST_TEXT:
217 TOKEN_CB(PXML_TEXT, state, 0);
218 break;
219 default: break; /* a no-op */
220 }
221 }
222
223finish:
224 *stateContext = (int)state;
Lev Walkin0fab1a62005-03-09 22:19:25 +0000225 return chunk_start - (const char *)xmlbuf;
Lev Walkindc06f6b2004-10-20 15:50:55 +0000226}
227