/*
 * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com.
 * Copyright (c) 2003, 2004, 2005 Lev Walkin <vlm@lionet.info>.
 * 	All rights reserved.
 * Redistribution and modifications are permitted subject to BSD license.
 */
7#include <assert.h>
8#include <errno.h>
9#include <stdlib.h>
10#include <stdio.h>
11#include <sys/types.h>
12
13#include <xer_support.h>
14
/*
 * Parser states.
 * pxml_parse() stores the current state in the caller-provided integer
 * context between invocations, so the numeric values of the enumerators
 * (ST_TEXT being 0) must stay stable.
 */
typedef enum {
	ST_TEXT,		/* Plain text; also the initial state */
	ST_TAG_START,		/* Seen '<', tag kind not yet known */
	ST_TAG_BODY,		/* Inside a tag */
	ST_TAG_QUOTE_WAIT,	/* Seen '=' inside a tag, value expected */
	ST_TAG_QUOTED_STRING,	/* Inside a double-quoted attribute value */
	ST_TAG_UNQUOTED_STRING,	/* Inside an unquoted attribute value */
	ST_COMMENT_WAIT_DASH1,	/* Seen "<!", waiting for the first '-' */
	ST_COMMENT_WAIT_DASH2,	/* Seen "<!-", waiting for the second '-' */
	ST_COMMENT,		/* Inside a comment */
	ST_COMMENT_CLO_DASH2,	/* Seen '-' in a comment, waiting for another */
	ST_COMMENT_CLO_RT	/* Seen "--" in a comment, waiting for '>' */
} pstate_e;
29
30static pxml_chunk_type_e final_chunk_type[] = {
31 PXML_TEXT,
32 PXML_TAG_END,
33 PXML_COMMENT_END,
34 PXML_TAG_END,
35 PXML_COMMENT_END,
36};
37
38
/*
 * Character classification table, indexed by byte value:
 *	0 - none of the below;
 *	1 - whitespace (HT, LF, FF, CR, SP; note that VT is not included);
 *	2 - decimal digit;
 *	3 - ASCII letter.
 * Only the first 128 entries are spelled out; bytes 0x80..0xff are
 * implicitly zero-initialized and so belong to class 0.
 */
static const int
_charclass[256] = {
	0,0,0,0,0,0,0,0, 0,1,1,0,1,1,0,0,
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
	2,2,2,2,2,2,2,2, 2,2,0,0,0,0,0,0,	/* 01234567 89       */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  ABCDEFG HIJKLMNO */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0,	/* PQRSTUVW XYZ      */
	0,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,	/*  abcdefg hijklmno */
	3,3,3,3,3,3,3,3, 3,3,3,0,0,0,0,0	/* pqrstuvw xyz      */
};
/*
 * Classification helpers. The argument is converted to unsigned char
 * first, so a negative (signed) char value never indexes below the table.
 */
#define WHITESPACE(c)	(_charclass[(unsigned char)(c)] == 1)
#define ALNUM(c)	(_charclass[(unsigned char)(c)] >= 2)
#define ALPHA(c)	(_charclass[(unsigned char)(c)] == 3)
53
/*
 * Aliases for characters, ASCII/UTF-8.
 * Numeric codes are used instead of character literals, so the parser
 * matches these byte values regardless of the execution character set.
 */
#define	EXCLAM	0x21	/* '!' */
#define	CQUOTE	0x22	/* '"' */
#define	CDASH	0x2d	/* '-' */
#define	CSLASH	0x2f	/* '/' */
#define	LANGLE	0x3c	/* '<' */
#define	CEQUAL	0x3d	/* '=' */
#define	RANGLE	0x3e	/* '>' */
#define	CQUEST	0x3f	/* '?' */
Lev Walkindc06f6b2004-10-20 15:50:55 +000063
/*
 * Invoke token callback.
 *
 * These macros are meant to be expanded inside pxml_parse() only: they
 * use its local variables (p, chunk_start, state, cb, key) and jump to
 * its "finish" label.
 *
 *	type		chunk type passed to the callback;
 *	_ns		parser state to switch to afterwards;
 *	_current_too	1 if the character at p belongs to the chunk, else 0;
 *	_final		currently unused.
 *
 * An empty chunk is not reported: only the state is changed.
 * If the callback returns less than the chunk size, parsing is aborted
 * by jumping to "finish" without advancing chunk_start; the state is
 * switched in that case only when the current character was part of
 * the chunk and the callback returned -1.
 */
#define	TOKEN_CB_CALL(type, _ns, _current_too, _final)	do {		\
		int _ret;						\
		pstate_e ns = (_ns);					\
		ssize_t _sz = (p - chunk_start) + (_current_too);	\
		if (!_sz) {						\
			/* Shortcut */					\
			state = ns;					\
			break;						\
		}							\
		_ret = cb((type), chunk_start, _sz, key);		\
		if(_ret < _sz) {					\
			if((_current_too) && _ret == -1)		\
				state = ns;				\
			goto finish;					\
		}							\
		chunk_start = p + (_current_too);			\
		state = ns;						\
	} while(0)

/* Report an ordinary (non-terminating) chunk. */
#define TOKEN_CB(_type, _ns, _current_too)			\
	TOKEN_CB_CALL(_type, _ns, _current_too, 0)

/* Report a chunk which terminates its XML element (tag or comment). */
#define TOKEN_CB_FINAL(_type, _ns, _current_too)		\
	TOKEN_CB_CALL(final_chunk_type[_type], _ns, _current_too, 1)
89
90/*
91 * Parser itself
92 */
Lev Walkin0fab1a62005-03-09 22:19:25 +000093ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) {
Lev Walkindc06f6b2004-10-20 15:50:55 +000094 pstate_e state = (pstate_e)*stateContext;
Lev Walkin0fab1a62005-03-09 22:19:25 +000095 const char *chunk_start = (const char *)xmlbuf;
96 const char *p = chunk_start;
97 const char *end = p + size;
Lev Walkindc06f6b2004-10-20 15:50:55 +000098
99 for(; p < end; p++) {
Lev Walkin0fab1a62005-03-09 22:19:25 +0000100 int C = *(const unsigned char *)p;
Lev Walkindc06f6b2004-10-20 15:50:55 +0000101 switch(state) {
102 case ST_TEXT:
103 /*
104 * Initial state: we're in the middle of some text,
105 * or just have started.
106 */
107 if (C == LANGLE)
108 /* We're now in the tag, probably */
109 TOKEN_CB(PXML_TEXT, ST_TAG_START, 0);
110 break;
111 case ST_TAG_START:
112 if (ALPHA(C) || (C == CSLASH))
113 state = ST_TAG_BODY;
114 else if (C == EXCLAM)
115 state = ST_COMMENT_WAIT_DASH1;
116 else
117 /*
118 * Not characters and not whitespace.
119 * Must be something like "3 < 4".
120 */
121 TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */
122 break;
123 case ST_TAG_BODY:
124 switch(C) {
125 case RANGLE:
126 /* End of the tag */
127 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
128 break;
129 case LANGLE:
130 /*
131 * The previous tag wasn't completed, but still
132 * recognized as valid. (Mozilla-compatible)
133 */
134 TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0);
135 break;
136 case CEQUAL:
137 state = ST_TAG_QUOTE_WAIT;
138 break;
139 }
140 break;
141 case ST_TAG_QUOTE_WAIT:
142 /*
143 * State after the equal sign ("=") in the tag.
144 */
145 switch(C) {
146 case CQUOTE:
147 state = ST_TAG_QUOTED_STRING;
148 break;
149 case RANGLE:
150 /* End of the tag */
151 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
152 break;
153 default:
154 if(!WHITESPACE(C))
155 /* Unquoted string value */
156 state = ST_TAG_UNQUOTED_STRING;
157 }
158 break;
159 case ST_TAG_QUOTED_STRING:
160 /*
161 * Tag attribute's string value in quotes.
162 */
163 if(C == CQUOTE) {
164 /* Return back to the tag state */
165 state = ST_TAG_BODY;
166 }
167 break;
168 case ST_TAG_UNQUOTED_STRING:
169 if(C == RANGLE) {
170 /* End of the tag */
171 TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1);
172 } else if(WHITESPACE(C)) {
173 /* Return back to the tag state */
174 state = ST_TAG_BODY;
175 }
176 break;
177 case ST_COMMENT_WAIT_DASH1:
178 if(C == CDASH) {
179 state = ST_COMMENT_WAIT_DASH2;
180 } else {
181 /* Some ordinary tag. */
182 state = ST_TAG_BODY;
183 }
184 break;
185 case ST_COMMENT_WAIT_DASH2:
186 if(C == CDASH) {
187 /* Seen "<--" */
188 state = ST_COMMENT;
189 } else {
190 /* Some ordinary tag */
191 state = ST_TAG_BODY;
192 }
193 break;
194 case ST_COMMENT:
195 if(C == CDASH) {
196 state = ST_COMMENT_CLO_DASH2;
197 }
198 break;
199 case ST_COMMENT_CLO_DASH2:
200 if(C == CDASH) {
201 state = ST_COMMENT_CLO_RT;
202 } else {
203 /* This is not an end of a comment */
204 state = ST_COMMENT;
205 }
206 break;
207 case ST_COMMENT_CLO_RT:
208 if(C == RANGLE) {
209 TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1);
210 } else {
211 state = ST_COMMENT;
212 }
213 break;
214 } /* switch(*ptr) */
215 } /* for() */
216
217 /*
218 * Flush the partially processed chunk, state permitting.
219 */
220 if(p - chunk_start) {
221 switch (state) {
222 case ST_COMMENT:
223 TOKEN_CB(PXML_COMMENT, state, 0);
224 break;
225 case ST_TEXT:
226 TOKEN_CB(PXML_TEXT, state, 0);
227 break;
228 default: break; /* a no-op */
229 }
230 }
231
232finish:
233 *stateContext = (int)state;
Lev Walkin0fab1a62005-03-09 22:19:25 +0000234 return chunk_start - (const char *)xmlbuf;
Lev Walkindc06f6b2004-10-20 15:50:55 +0000235}
236