Changeset 1cd3218 in serd


Ignore:
Timestamp:
2017-07-30 01:38:24
Author:
David Robillard <d@…>
Branches:
master, serd1
Children:
30379f4
Parents:
f15d475
Message:

Support strict parsing of prefixed names

Files:
3 edited

Legend:

Unmodified
Added
Removed
  • NEWS

    r32f1075 r1cd3218  
     1serd (0.29.1) unstable; 
     2 
     3  * Support strict parsing of prefixed names 
     4 
     5 -- David Robillard <d@drobilla.net>  Sun, 30 Jul 2017 10:35:24 +0200 
     6 
    17serd (0.28.0) stable; 
    28 
  • src/reader.c

    rc0c776a r1cd3218  
    391391 
    392392static inline SerdStatus 
    393 bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c) 
    394 { 
    395     r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c); 
    396     push_bytes(reader, dest, replacement_char, 3); 
    397  
     393bad_char(SerdReader* reader, const char* fmt, uint8_t c) 
     394{ 
    398395    // Skip bytes until the next start byte 
    399396    for (uint8_t b = peek_byte(reader); (b & 0x80);) { 
     
    402399    } 
    403400 
    404     return SERD_FAILURE; 
     401    r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c); 
     402    return reader->strict ? SERD_ERR_BAD_SYNTAX : SERD_FAILURE; 
     403} 
     404 
     405static SerdStatus 
     406read_utf8_bytes(SerdReader* reader, uint8_t bytes[4], uint32_t* size, uint8_t c) 
     407{ 
     408    *size = utf8_num_bytes(c); 
     409    if (*size <= 1 || *size > 4) { 
     410        return bad_char(reader, "invalid UTF-8 start 0x%X\n", c); 
     411    } 
     412 
     413    bytes[0] = c; 
     414    for (unsigned i = 1; i < *size; ++i) { 
     415        if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) { 
     416            return bad_char(reader, "invalid UTF-8 continuation 0x%X\n", 
     417                            bytes[i]); 
     418        } 
     419        eat_byte_safe(reader, bytes[i]); 
     420    } 
     421 
     422    return SERD_SUCCESS; 
    405423} 
    406424 
     
    408426read_utf8_character(SerdReader* reader, Ref dest, uint8_t c) 
    409427{ 
    410     const uint32_t size = utf8_num_bytes(c); 
    411     if (size <= 1 || size > 4) { 
    412         return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", c); 
    413     } 
    414  
    415     uint8_t bytes[4]; 
    416     bytes[0] = c; 
    417  
    418     // Check character validity 
    419     for (unsigned i = 1; i < size; ++i) { 
    420         if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) { 
    421             return bad_char(reader, dest, "invalid UTF-8 continuation 0x%X\n", 
    422                             bytes[i]); 
    423         } 
    424         eat_byte_safe(reader, bytes[i]); 
    425     } 
    426  
    427     // Emit character 
     428    uint32_t   size; 
     429    uint8_t    bytes[4]; 
     430    SerdStatus st = read_utf8_bytes(reader, bytes, &size, c); 
     431    if (st) { 
     432        push_bytes(reader, dest, replacement_char, 3); 
     433    } else { 
     434        push_bytes(reader, dest, bytes, size); 
     435    } 
     436    return st; 
     437} 
     438 
     439static SerdStatus 
     440read_utf8_code(SerdReader* reader, Ref dest, uint32_t* code, uint8_t c) 
     441{ 
     442    uint32_t   size; 
     443    uint8_t    bytes[4]; 
     444    SerdStatus st = read_utf8_bytes(reader, bytes, &size, c); 
     445    if (st) { 
     446        push_bytes(reader, dest, replacement_char, 3); 
     447        return st; 
     448    } 
     449 
    428450    push_bytes(reader, dest, bytes, size); 
    429     return SERD_SUCCESS; 
     451    *code = parse_counted_utf8_char(bytes, size); 
     452    return st; 
    430453} 
    431454 
     
    602625} 
    603626 
     627static inline bool 
     628is_PN_CHARS_BASE(const uint32_t c) 
     629{ 
     630    return ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c <= 0x00F6) || 
     631            (c >= 0x00F8 && c <= 0x02FF) || (c >= 0x0370 && c <= 0x037D) || 
     632            (c >= 0x037F && c <= 0x1FFF) || (c >= 0x200C && c <= 0x200D) || 
     633            (c >= 0x2070 && c <= 0x218F) || (c >= 0x2C00 && c <= 0x2FEF) || 
     634            (c >= 0x3001 && c <= 0xD7FF) || (c >= 0xF900 && c <= 0xFDCF) || 
     635            (c >= 0xFDF0 && c <= 0xFFFD) || (c >= 0x10000 && c <= 0xEFFFF)); 
     636} 
     637 
    604638static SerdStatus 
    605639read_PN_CHARS_BASE(SerdReader* reader, Ref dest) 
    606640{ 
    607     const uint8_t c = peek_byte(reader); 
    608     if ((c & 0x80)) {  // Multi-byte character 
    609         return read_utf8_character(reader, dest, eat_byte_safe(reader, c)); 
    610     } else if (is_alpha(c)) { 
     641    uint32_t      code; 
     642    const uint8_t c  = peek_byte(reader); 
     643    SerdStatus    st = SERD_SUCCESS; 
     644    if (is_alpha(c)) { 
    611645        push_byte(reader, dest, eat_byte_safe(reader, c)); 
    612         return SERD_SUCCESS; 
    613     } 
    614     return SERD_FAILURE; 
     646    } else if (!(c & 0x80)) { 
     647        return SERD_FAILURE; 
     648    } else if ((st = read_utf8_code(reader, dest, &code, 
     649                                    eat_byte_safe(reader, c)))) { 
     650        return st; 
     651    } else if (reader->strict && !is_PN_CHARS_BASE(code)) { 
     652        return r_err(reader, SERD_ERR_BAD_SYNTAX, 
     653                     "invalid character U+%04X in name\n", code); 
     654    } 
     655    return st; 
     656} 
     657 
     658static inline bool 
     659is_PN_CHARS(const uint32_t c) 
     660{ 
     661    return (is_PN_CHARS_BASE(c) || c == 0xB7 || 
     662            (c >= 0x0300 && c <= 0x036F) || (c >= 0x203F && c <= 0x2040)); 
    615663} 
    616664 
     
    618666read_PN_CHARS(SerdReader* reader, Ref dest) 
    619667{ 
     668    uint32_t      code; 
    620669    const uint8_t c = peek_byte(reader); 
    621     if ((c & 0x80)) {  // Multi-byte character 
    622         return read_utf8_character(reader, dest, eat_byte_safe(reader, c)); 
    623     } else if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') { 
     670    SerdStatus    st = SERD_SUCCESS; 
     671    if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') { 
    624672        push_byte(reader, dest, eat_byte_safe(reader, c)); 
    625         return SERD_SUCCESS; 
    626     } 
    627     return SERD_FAILURE; 
     673    } else if (!(c & 0x80)) { 
     674        return SERD_FAILURE; 
     675    } else if ((st = read_utf8_code(reader, dest, &code, 
     676                                    eat_byte_safe(reader, c)))) { 
     677        return st; 
     678    } else if (reader->strict && !is_PN_CHARS(code)) { 
     679        r_err(reader, (st = SERD_ERR_BAD_SYNTAX), 
     680              "invalid character U+%04X in name\n", code); 
     681    } 
     682    return st; 
    628683} 
    629684 
     
    689744        } else if ((st = read_PLX(reader, dest)) > SERD_FAILURE) { 
    690745            return st; 
    691         } else if (st != SERD_SUCCESS && read_PN_CHARS(reader, dest)) { 
     746        } else if (st != SERD_SUCCESS && (st = read_PN_CHARS(reader, dest))) { 
    692747            break; 
    693748        } 
     
    702757    } 
    703758 
    704     return SERD_SUCCESS; 
     759    return (st > SERD_FAILURE) ? st : SERD_SUCCESS; 
    705760} 
    706761 
  • wscript

    r32f1075 r1cd3218  
    1212# minor increment <=> compatible changes (additions) 
    1313# micro increment <=> no interface changes 
    14 SERD_VERSION       = '0.28.0' 
     14SERD_VERSION       = '0.29.1' 
    1515SERD_MAJOR_VERSION = '0' 
    1616 
     
    487487    autowaf.begin_tests(ctx, APPNAME, 'bad') 
    488488    for test in bad_tests: 
    489         for lax in ['', '-l']: 
     489        for lax in ['']: 
    490490            autowaf.run_test( 
    491491                ctx, APPNAME, 
Note: See TracChangeset for help on using the changeset viewer.