Neels Hofmeyr | 7ab5fc1 | 2018-11-15 23:29:56 +0100 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | |
| 3 | '''Using mad regexes, automatically make sure that all structs with sub-byte |
| 4 | integers have matching big-endian definitions. The idea is to save a lot of |
| 5 | manual effort, and to automatically verify that there are no errors. |
| 6 | This script most certainly has numerous holes and shortcomings, but actually, |
| 7 | if you hit problems with it, rather adjust your coding style so that this |
| 8 | script can deal with it...''' |
| 9 | |
| 10 | import re |
| 11 | import sys |
| 12 | import codecs |
| 13 | import os.path |
| 14 | |
# A top-level struct definition: 'struct name {' alone on an unindented line,
# closed by an unindented '}' followed by anything up to the ';'.
re_struct_start = re.compile(r'^struct\s*[a-zA-Z_][a-zA-Z_0-9]*\s*{\s*$')
re_struct_end = re.compile(r'^}[^;]*;\s*$')

# An indented anonymous 'struct {' nested in a struct body, and its closing
# line either with a member name ('} foo;') or without one ('};').
re_substruct_start = re.compile(r'^\s+struct\s*{\s*$')
re_substruct_end = re.compile(r'^\s+}\s*([^;]*\s)[a-zA-Z_][a-zA-Z_0-9]*\s*;\s*$')
re_unnamed_substruct_end = re.compile(r'^\s+}\s*;\s*$')

# One integer member definition up to and including its ';'.
# group(1) is the leading whitespace plus type keywords, group(4) the member
# list including the ';'.
# NOTE(review): 'uint[0-9]_t' accepts a single digit only (e.g. uint8_t),
# unlike 'int[0-9]+_t' next to it -- confirm whether that is intentional
# before widening it, since it decides which definitions get analysed.
re_int_def = re.compile(r'(^\s*((const|unsigned|signed|char|int|long|int[0-9]+_t|uint[0-9]_t)\s+)+\s*)([^;]*;)',
                        re.DOTALL | re.MULTILINE)
# Each member of such a list: a plain 'name' or a bit-field 'name:bits'.
re_int_members = re.compile(r'([a-zA-Z_][a-zA-Z_0-9]*|[a-zA-Z_][a-zA-Z_0-9]*\s*:\s*[0-9]+)\s*[,;]\s*', re.DOTALL | re.MULTILINE)

# Preprocessor lines that delimit existing endian-specific sections.
re_little_endian_ifdef = re.compile(r'#\s*(if|elif)\s+OSMO_IS_LITTLE_ENDIAN\s*(==\s*1\s*|)')
re_big_endian_ifdef = re.compile(r'#\s*(if|elif)\s+OSMO_IS_BIG_ENDIAN\s*')
re_else = re.compile(r'#\s*else\s*')
re_endif = re.compile(r'#\s*endif\s*')

# A C comment: '/* ... */' (may span lines, may contain '*', may be empty) or
# '// ...' up to, but not including, the end of the line.
# Keep exactly one capture group: remove_c_comments() relies on re.split()
# returning code and comment pieces alternately.
re_c_comment = re.compile(r'(/\*.*?\*/|//[^\n]*)', re.DOTALL)
| 32 | |
def remove_c_comments(code_str):
    '''Return code_str with every span matched by re_c_comment cut out.

    Only the matched comment text is removed; everything around it,
    including line breaks, is kept as it is.'''
    return re_c_comment.sub('', code_str)
| 35 | |
def section_struct_body(struct_body_lines):
    '''Divide the body of a top-level struct into alternating sections.

    Aim: handle each sub-struct on its own, and if there already are ifdefs
    for little and big endian, keep just the little endian bit so that big
    endian can be derived from it.

    struct_body_lines: the lines between the opening 'struct foo {' and the
    closing '} ...;' line. The list is not modified.

    Return: a list of lists of lines. Items at even indexes are "arbitrary"
    lines (sub-struct openers like 'struct {' and closers like '} sub_name;'),
    items at odd indexes are "body lines", i.e. lines that define struct
    members (possibly with comments). An arbitrary item may be an empty list;
    it is emitted anyway to keep the even/odd alternation intact.

    Existing endianness preprocessor lines (#if/#elif/#else/#endif) and
    everything inside a big-endian section are dropped from the result.
    '''

    # Work on a copy: an '#else' line gets overwritten below so that the main
    # loop can treat it like an explicit #if, and the caller's list should not
    # change behind its back.
    lines = list(struct_body_lines)

    struct_body_parts = []
    arbitrary_part = []
    def_part = []

    def end_def():
        '''if there is any content, flush out recorded parts (def_part,
        arbitrary_part) and start a new part. In short, cut a section
        boundary.'''
        nonlocal arbitrary_part, def_part

        if def_part:
            # always emit the (possibly empty) arbitrary part first, so that
            # definitions end up at odd indexes
            struct_body_parts.append(arbitrary_part)
            arbitrary_part = []
            struct_body_parts.append(def_part)
            def_part = []

    j = 0
    while j < len(lines):
        line = lines[j]

        if (re_substruct_start.fullmatch(line)
                or re_substruct_end.fullmatch(line)
                or re_unnamed_substruct_end.fullmatch(line)):
            end_def()
            arbitrary_part.append(line)
            j += 1
            continue

        if re_big_endian_ifdef.fullmatch(line):
            end_def()
            # discard big endian section
            j += 1
            while j < len(lines):
                line = lines[j]
                if re_endif.fullmatch(line):
                    end_def()
                    j += 1
                    break
                if re_little_endian_ifdef.fullmatch(line):
                    end_def()
                    # keep that start of little endian section, not j++
                    break
                if re_else.fullmatch(line):
                    # there's an '#else' after big-endian. Shim a
                    # little-endian header in just for the loop.
                    lines[j] = '#if OSMO_IS_LITTLE_ENDIAN\n'
                    break
                j += 1
            continue

        if re_little_endian_ifdef.fullmatch(line):
            end_def()
            j += 1
            while j < len(lines):
                line = lines[j]
                if re_endif.fullmatch(line):
                    end_def()
                    j += 1
                    break
                if re_big_endian_ifdef.fullmatch(line):
                    end_def()
                    # keep that start of big endian section, not j++
                    break
                if re_else.fullmatch(line):
                    # there's an '#else' after little-endian. Shim a
                    # big-endian header in just for the loop.
                    lines[j] = '#if OSMO_IS_BIG_ENDIAN\n'
                    break
                def_part.append(line)
                j += 1
            continue

        def_part.append(line)
        j += 1

    # flush the last section remaining that didn't see an explicit end
    end_def()
    # end_def() only flushes arbitrary_part if there was a def_part, so:
    if arbitrary_part:
        struct_body_parts.append(arbitrary_part)

    return struct_body_parts
| 137 | |
def struct_body_to_big_endian(body_str):
    '''Derive the big-endian variant of a struct body.

    Input: a multi-line string containing the body of a struct, i.e. without
    sub-structs and without #if OSMO_IS_BIG_ENDIAN. like

     '\tconst char *foo;\n\tuint8_t moo:3, goo:2;\n\tuint8_t loo:3;\n\tvoid *baz;\n'

    Return None if there are no sub-byte integers, to indicate that there is
    no little/big endian split required. Otherwise return a multi-line string
    of the big-endian version of this same struct body, where sub-byte ints
    are reversed at byte boundaries, and all others are copied 1:1. Comments
    and empty lines are not carried over.

    Raise Exception if the sub-byte members do not fill whole bytes (in the
    middle or at the end of the body), if an incomplete byte is continued with
    a different integer type, or if a bit-field definition also contains a
    member without a bit width.'''

    # kick comments out of the code analysis. They will end up being stripped
    # from big-endian only.
    body_str = remove_c_comments(body_str)

    # individual definitions, each with its terminating ';' put back
    def_strs = ('%s;' % def_str for def_str in body_str.split(';') if def_str.strip())

    # classify defs as containing sub-byte members or not
    # defs = [ (True, 'uint8_t foo:3, bar:5;', 'uint8_t ', ['foo:3', 'bar:5']),
    #          (False, 'int baz;'), ...]
    defs = []
    any_sub_byte_ints = False
    for one_def in def_strs:
        int_def = re_int_def.fullmatch(one_def)
        if not int_def:
            # not even a number, same for big and little endian
            defs.append((False, one_def))
            continue

        int_type = int_def.group(1)
        members_str = int_def.group(4)
        members = [found.group(1) for found in re_int_members.finditer(members_str)]

        if any(':' in member for member in members):
            defs.append((True, one_def, int_type, members))
            any_sub_byte_ints = True
        else:
            defs.append((False, one_def))

    if not any_sub_byte_ints:
        return None

    # now the interesting part, go over the defs, and reverse the sub-byte ints
    # at byte boundaries.

    got_bits = 0
    byte_type = None
    members_within_a_byte = []
    big_endian_defs = []

    for classified_def in defs:
        if not classified_def[0]:
            if got_bits:
                raise Exception('sub-byte members do not add up to clean byte bounds: %r' % members_within_a_byte)
            big_endian_defs.append(classified_def[1])
            continue

        _, _, int_type, members = classified_def

        # byte_type is only set here while a byte started by a previous
        # definition is still incomplete
        if byte_type and byte_type.strip() != int_type.strip():
            raise Exception('mismatching type continuation after incomplete byte: %r %r to %r'
                            % (byte_type, members_within_a_byte, int_type))

        for member in members:
            if ':' not in member:
                raise Exception('member without bit width in a sub-byte definition: %r' % member)

            # (Re-)set the type for every member: one definition may span
            # several bytes, and byte_type is cleared whenever a byte is
            # complete.
            byte_type = int_type

            member_name, bits_str = member.split(':')
            member_name = member_name.strip()
            bits = int(bits_str)
            member = '%s:%d' % (member_name, bits)
            members_within_a_byte.append(member)
            got_bits += bits

            if got_bits == 8:
                # reverse these.
                big_endian_defs.append('%s%s;' % (byte_type, ', '.join(reversed(members_within_a_byte))))
                members_within_a_byte = []
                byte_type = None
                got_bits = 0

            elif got_bits > 8:
                raise Exception('sub-byte int breaks clean byte bounds: %s -- %d + %d = %d bits'
                                % (member, got_bits - bits, bits, got_bits))

    # An incomplete byte at the very end must not be dropped silently from the
    # big-endian variant.
    if got_bits:
        raise Exception('sub-byte members do not add up to clean byte bounds: %r' % members_within_a_byte)

    # strip empty lines, and clean lines' whitespace errors we might have
    # taken in with the type names
    lines = [l.rstrip(' \t') for l in ''.join(big_endian_defs).split('\n') if l.strip()]
    return '\n'.join(lines)
| 247 | |
def handle_struct_body(body_str):
    '''Wrap body_str in little/big endian preprocessor branches if needed.

    If struct_body_to_big_endian() yields a non-empty big-endian variant,
    return body_str as the OSMO_IS_LITTLE_ENDIAN branch followed by that
    variant as the OSMO_IS_BIG_ENDIAN branch. Otherwise return body_str
    unchanged.'''

    big_endian_variant = struct_body_to_big_endian(body_str)
    if not big_endian_variant:
        return body_str

    pieces = (
        '#if OSMO_IS_LITTLE_ENDIAN\n',
        body_str,
        '#elif OSMO_IS_BIG_ENDIAN\n',
        '/* auto-generated from the little endian part above (libosmocore/contrib/struct_endianess.py) */\n',
        big_endian_variant,
        '\n#endif\n',
    )
    return ''.join(pieces)
| 262 | |
def _check_file(f):
    '''Rewrite the file at path f so that packed structs with sub-byte
    integers carry matching little and big endian definitions.

    Files not ending in .h, .c or .cpp are ignored. If the result uses
    OSMO_IS_LITTLE_ENDIAN, an include of osmocom/core/endian.h is added after
    the last suitable #include unless it is already there. The file is only
    written when its content actually changes.

    Raise Exception if a struct cannot be handled, or if the endian.h include
    is needed but the file has no #include to put it after.'''
    if not f.endswith(('.h', '.c', '.cpp')):
        return

    with codecs.open(f, "r", "utf-8") as fd:
        file_lines = fd.readlines()

    # section the file into
    # [ ["no struct def"], ["struct {...};"], ["no struct def"], ... ]
    sections = []
    in_struct = False
    buf = []
    for line in file_lines:
        if not in_struct and re_struct_start.fullmatch(line):
            # flush whatever might still be in buf from before (even if
            # empty, so that structs always sit at odd indexes)
            sections.append(buf)
            # start an in_struct section
            buf = [line]
            in_struct = True
        elif in_struct and re_struct_end.fullmatch(line):
            # add this end to the in_struct section and then start a
            # non-struct section
            buf.append(line)
            sections.append(buf)
            in_struct = False
            buf = []
        else:
            buf.append(line)
    # flush any leftovers in buf
    if buf:
        sections.append(buf)

    # examine each struct, i.e. every second item in 'sections'
    for i in range(1, len(sections), 2):
        struct = sections[i]

        # If the struct isn't packed, we need not bother.
        # The practical use of this: in some structs we have booleans in the
        # form of
        #     integer flag:1;
        # and these don't add up to bytes, and cause errors. So let's skip all
        # non-packed structs, then all of those are out of the picture.
        if 'packed' not in struct[-1]:
            continue

        try:
            # assume the 'struct foo {' is on the first line, the closing
            # brace '} __attribute...;' on the last, and the rest are
            # individual definitions split by ';'.
            struct_body_parts = section_struct_body(struct[1:-1])

            new_struct_body_parts = []
            for j, part_lines in enumerate(struct_body_parts):
                part = ''.join(part_lines)
                # odd items are member definitions, even ones are copied 1:1
                if j & 1:
                    part = handle_struct_body(part)
                new_struct_body_parts.append(part)

            sections[i] = [struct[0], ''.join(new_struct_body_parts), struct[-1]]
        except Exception as e:
            raise Exception('ERROR in struct %r' % struct[0]) from e

    # phew. result.
    result = ''.join(''.join(s) for s in sections)

    # see if osmocom/core/endian.h is needed and included.
    if (not f.endswith('endian.h')
            and 'OSMO_IS_LITTLE_ENDIAN' in result
            and '#include <osmocom/core/endian.h>' not in result):
        # add the include after the last 'osmocom/core' include
        last_include_start = result.rfind('#include <osmocom/core/')
        if last_include_start < 0:
            last_include_start = result.rfind('#include <osmocom/')
        if last_include_start < 0:
            last_include_start = result.rfind('#include')

        if last_include_start < 0:
            raise Exception('do not know where to include osmocom/core/endian.h in %r' % f)

        insert_at = result.find('\n', last_include_start)
        if insert_at < 0:
            # the last #include is on the final line and lacks a line break
            insert_at = len(result)

        result = result[:insert_at] + '\n#include <osmocom/core/endian.h>' + result[insert_at:]

    # leave files alone that need no change
    if result == ''.join(file_lines):
        return

    with codecs.open(f, "w", "utf-8") as fd:
        fd.write(result)
| 352 | |
def check_file(f):
    '''Run _check_file() on path f.

    Any Exception it raises is re-raised as an Exception naming the file,
    with the original error attached as its cause.'''
    try:
        _check_file(f)
    except Exception as e:
        raise Exception('ERROR IN FILE %r' % f) from e
| 358 | |
# Paths to process come from the command line; without any, process the
# current directory.
args = sys.argv[1:] or ['.']

for f in args:
    if not os.path.isdir(f):
        check_file(f)
        continue

    # directories are searched recursively; check_file() itself skips files
    # that are not C sources or headers.
    # NOTE(review): the two None arguments land on os.walk()'s topdown and
    # onerror parameters -- confirm against the os.walk() documentation
    # before changing them.
    for parent_path, _subdirs, files in os.walk(f, None, None):
        for ff in files:
            check_file(os.path.join(parent_path, ff))
| 370 | |
| 371 | # vim: tabstop=4 shiftwidth=4 expandtab |