Changeset b354c6b519a44d37e66bbfe4571631c26e55ddeb

Show
Ignore:
Timestamp:
06/21/2008 02:54:18 PM (2 months ago)
Author:
mitchell <mitchell@frost.(none)>
git-committer:
mitchell <mitchell@frost.(none)> 1214085258 -0400
git-parent:

[252d76fbd59b83e18b43397931b40a92563eadea]

git-author:
mitchell <mitchell@frost.(none)> 1214085258 -0400
Message:

Fixed issue with Ragel executing actions in real time; fixed some parsers.
Added a queue for callbacks in entities that span multiple lines and may not
have end delimiters. Ragel will backtrack if the complete match fails, so in
that case the false callbacks will not be called. If the end delimiter is
reached, callbacks in the queue are committed and executed.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • PARSER_DOC

    r78421f8 rb354c6b  
    221221      defining line_start. 
    222222 
     223      Also for multi-line matches, it may be necessary to use the 'queue' and 
     224      'commit' actions. If it is possible that a multi-line entity will not have 
     225      an ending delimiter (for example a string), use the 'queue' action as soon 
     226      as the start delimiter has been detected, and the 'commit' action as soon 
     227      as the end delimiter has been detected. This will eliminate the potential 
     228      for any counting errors. 
     229 
    223230    Notes: 
    224231      * You can be a bit sloppy with the line counting machine. For example the 
     
    298305      } 
    299306 
     307    Note: the 'ls', 'code', 'comment', 'queue' and 'commit' actions are 
     308    completely unnecessary. 
     309 
    300310    Parsers for Embedded Languages: 
    301311      TODO: 
  • ext/ohcount_native/ragel_parser_macros.h

    ra3e0e88 rb354c6b  
    22#define RAGEL_PARSER_MACROS 
    33 
    4 /* Sets the line_start variable to ts. 
     4/** 
     5 * Callback struct for queues. 
     6 * @field lang The Language name. 
     7 * @field entity The entity name. 
     8 * @field s The start position of the entity in the buffer. 
     9 * @field e The end position of the entity in the buffer. 
     10 * @field next The next Callback in the queue. 
     11 */ 
     12typedef struct callback_list_item { 
     13  const char *lang; // language name; should NOT be freed 
     14  const char *entity; // entity name; should NOT be freed 
     15  int s, e; // start and end positions of the entity in the buffer (use cint) 
     16  struct callback_list_item *next; // the next callback to call 
     17} Callback; 
     18 
     19Callback *callback_list_head = NULL; 
     20Callback *callback_list_tail = NULL; 
     21 
     22/** 
     23 * Enqueues a callback for calling upon commit. 
     24 * This is only necessary for line counting machines. 
     25 * Ragel will execute actions in real-time rather than after a complete match. 
     26 * This is a problem for entities that contain internal newlines, since there is 
     27 * a callback for each internal newline whether or not the end of the entity 
     28 * matches. This means that if, for example, the beginning of a string entity is 
     29 * matched, the text following is treated as code until the ending delimiter. If 
     30 * there is no ending delimiter (it was not actually a string entity), Ragel 
     31 * will jump back to the beginning of the string and reparse the text again. 
     32 * This means all the callbacks called were probably not accurate. 
     33 * To remedy this, any entity which needs an ending delimiter that may not 
     34 * appear will have its callbacks enqueued and then committed when the ending 
     35 * delimiter is reached. If that delimiter is not reached, the callbacks are 
     36 * never called. 
     37 * @param lang The language name. 
     38 * @param entity The entity (lcode, lcomment, lblank). 
     39 * @param s The start position of the entity in the buffer. 
     40 * @param e The end position of the entity in the buffer. 
     41 */ 
     42void enqueue(const char *lang, const char *entity, int s, int e) { 
     43  Callback *item = (Callback *) malloc(sizeof(Callback)); 
     44  //assert(item != NULL); // trap malloc errors 
     45 
     46  item->lang = lang; 
     47  item->entity = entity; 
     48  item->s = s; 
     49  item->e = e; 
     50  item->next = NULL; 
     51 
     52  if (!callback_list_head) { 
     53    callback_list_head = item; 
     54    callback_list_tail = item; 
     55  } else { 
     56    callback_list_tail->next = item; 
     57    callback_list_tail = item; 
     58  } 
     59
     60 
     61/** Frees the memory used by a queue. */ 
     62void free_queue() { 
     63  Callback *item = callback_list_head; 
     64  while (item != NULL) { 
     65    Callback *next = item->next; 
     66    free(item); 
     67    item = next; 
     68  } 
     69
     70 
     71/** 
     72 * Restores settings for a failed enqueued entity. 
     73 * This is typically used in the ls, code, and comment macros. 
     74 */ 
     75#define dequeue { \ 
     76  inqueue = 0; \ 
     77  line_start = last_line_start; \ 
     78  line_contains_code = last_line_contains_code; \ 
     79  whole_line_comment = last_whole_line_comment; \ 
     80
     81 
     82/** 
     83 * Sets the line_start variable to ts. 
    584 * This is typically used for the SPACE entity in the main action. 
    685 */ 
    7 #define ls { if (!line_start) line_start = ts; } 
    8  
    9 /* The C equivalent of the Ragel 'code' action. 
     86#define ls { \ 
     87  if (inqueue) { dequeue; } \ 
     88  if (!line_start) line_start = ts; \ 
     89
     90 
     91/** 
     92 * The C equivalent of the Ragel 'code' action. 
    1093 * This is typically used in the main action for entities where Ragel actions 
    1194 * cannot, for one reason or another, be used. 
    1295 */ 
    1396#define code { \ 
     97  if (inqueue) { dequeue; } \ 
    1498  if (!line_contains_code && !line_start) line_start = ts; \ 
    1599  line_contains_code = 1; \ 
    16100} 
    17101 
    18 /* The C equivalent of the Ragel 'comment' action. 
     102/** 
     103 * The C equivalent of the Ragel 'comment' action. 
    19104 * This is typically unused, but here for consistency. 
    20105 */ 
    21106#define comment { \ 
     107  if (inqueue) { dequeue; } \ 
    22108  if (!line_contains_code) { \ 
    23109    whole_line_comment = 1; \ 
     
    26112} 
    27113 
    28 /* Sets up for having seen an embedded language. 
     114/** 
     115 * Sets up for having seen an embedded language. 
    29116 * This is typically used when entering an embedded language which usually does 
    30117 * not span multiple lines (e.g. php for <?php echo 'blah' ?> on single lines) 
     
    38125} 
    39126 
    40 /* Executes standard line counting actions for INTERNAL_NL entities. 
     127/** 
     128 * Executes standard line counting actions for INTERNAL_NL entities. 
    41129 * This is typically used in the main action for the INTERNAL_NL entity. 
    42130 * @param lang The language name string. 
     
    44132#define std_internal_newline(lang) { \ 
    45133  if (callback && p > line_start) { \ 
    46     if (line_contains_code) \ 
    47       callback(lang, "lcode", cint(line_start), cint(p)); \ 
    48     else if (whole_line_comment) \ 
    49       callback(lang, "lcomment", cint(line_start), cint(p)); \ 
    50     else \ 
    51       callback(lang, "lblank", cint(line_start), cint(p)); \ 
     134    if (line_contains_code) { \ 
     135      if (inqueue) \ 
     136        enqueue(lang, "lcode", cint(line_start), cint(p)); \ 
     137      else \ 
     138        callback(lang, "lcode", cint(line_start), cint(p)); \ 
     139    } else if (whole_line_comment) { \ 
     140      if (inqueue) \ 
     141        enqueue(lang, "lcomment", cint(line_start), cint(p)); \ 
     142      else \ 
     143        callback(lang, "lcomment", cint(line_start), cint(p)); \ 
     144    } else { \ 
     145      if (inqueue) \ 
     146        enqueue(lang, "lblank", cint(line_start), cint(p)); \ 
     147      else \ 
     148        callback(lang, "lblank", cint(line_start), cint(p)); \ 
     149    } \ 
    52150  } \ 
    53151  whole_line_comment = 0; \ 
     
    56154} 
    57155 
    58 /* Executes emebedded language line counting actions for INTERNAL_NL entities 
     156/** 
     157 * Executes embedded language line counting actions for INTERNAL_NL entities 
    59158 * based on whether or not the embedded language's code has been seen in a 
    60159 * parent line. 
     
    70169} 
    71170 
    72 /* Executes standard line counting actions for NEWLINE entities. 
     171/** 
     172 * Executes standard line counting actions for NEWLINE entities. 
    73173 * This is typically used in the main action for the NEWLINE entity. 
    74174 * @param lang The language name string. 
     
    88188} 
    89189 
    90 /* Executes embedded language line counting actions for NEWLINE entities based 
     190/** 
     191 * Executes embedded language line counting actions for NEWLINE entities based 
    91192 * on whether or not the embedded language's code has been seen in a parent 
    92193 * line. 
     
    102203} 
    103204 
    104 /* Processes the last line for buffers that don't have a newline at EOF. 
     205/** 
     206 * Processes the last line for buffers that don't have a newline at EOF. 
    105207 * This is typically used at the end of the parse_lang function after the Ragel 
    106208 * parser has been executed. 
     
    116218} 
    117219 
    118 /* Determines whether or not the rest of the line is blank. 
     220/** 
     221 * Determines whether or not the rest of the line is blank. 
    119222 * This is typically used when entering an embedded language. 
    120223 * @param p The position of entry into the embedded language. 
     
    133236} 
    134237 
    135 /* If there is a transition into an embedded language and there is only parent 
     238/** 
     239 * If there is a transition into an embedded language and there is only parent 
    136240 * language code on the line (the rest of the line is blank with no child code), 
    137241 * count the line as a line of parent code. 
     
    180284// keeps track of an embedded language 
    181285const char *seen; 
     286 
     287// whether or not to enqueue callbacks instead of calling them in real time 
     288int inqueue; 
     289 
     290// backups for 'inqueue'ing 
     291char *last_line_start; 
     292int last_line_contains_code, last_whole_line_comment; 
    182293 
    183294#define init { \ 
     
    192303  entity = 0; \ 
    193304  seen = 0; \ 
     305  inqueue = 0; \ 
    194306} 
    195307 
  • ext/ohcount_native/ragel_parsers/common.rl

    r0b4bd62 rb354c6b  
    2727 
    2828# common actions 
     29 
     30action queue { 
     31  inqueue = 1; 
     32  free_queue(); // free the current queue 
     33  callback_list_head = NULL; 
     34  callback_list_tail = NULL; 
     35  // set backup variables 
     36  last_line_start = line_start; 
     37  last_line_contains_code = line_contains_code; 
     38  last_whole_line_comment = whole_line_comment; 
     39} 
     40 
     41action commit { 
     42  if (inqueue) { 
     43    Callback *item; 
     44    for (item = callback_list_head; item != NULL; item = item->next) 
     45      callback(item->lang, item->entity, item->s, item->e); 
     46    free_queue(); 
     47    inqueue = 0; 
     48  } 
     49} 
    2950 
    3051action ls { if (!line_start) line_start = ts; } 
  • ext/ohcount_native/ragel_parsers/html.rl

    r0b4bd62 rb354c6b  
    6464    )* :>> '-->'; 
    6565 
    66   html_sq_str = 
    67     '\'' @code ( 
    68       newline %{ entity = INTERNAL_NL; } %html_ccallback 
    69       | 
    70       ws 
    71       | 
    72       [^\r\n\f\t '\\] @code 
    73       | 
    74       '\\' nonnewline @code 
    75     )* '\''; 
    76   html_dq_str = 
    77     '"' @code ( 
    78       newline %{ entity = INTERNAL_NL; } %html_ccallback 
    79       | 
    80       ws 
    81       | 
    82       [^\r\n\f\t "\\] @code 
    83       | 
    84       '\\' nonnewline @code 
    85     )* '"'; 
     66  html_sq_str = '\'' ([^\r\n\f'\\] | '\\' nonnewline)* '\'' @code; 
     67  html_dq_str = '"' ([^\r\n\f"\\] | '\\' nonnewline)* '"' @code; 
    8668  html_string = html_sq_str | html_dq_str; 
    8769 
  • ext/ohcount_native/ragel_parsers/perl.rl

    r0b4bd62 rb354c6b  
    8888      '\\' nonnewline @code 
    8989    )* '`' @code; 
    90   perl_regex = '/' ([^\r\n\f\t /\\] | '\\' nonnewline)* '/' @code; 
     90  perl_regex = '/' ([^\r\n\f/\\] | '\\' nonnewline)* '/' @code; 
    9191  # TODO: heredoc detection 
    9292  # This is impossible with current Ragel. We need to extract what the end 
  • ext/ohcount_native/ragel_parsers/ruby.rl

    r0b4bd62 rb354c6b  
    4949  # Can't do that now because using 'when starts_line' fails a Ragel assertion. 
    5050  ruby_block_comment = 
    51     '=begin' @comment ( 
     51    '=begin' @queue @comment ( 
    5252      newline %{ entity = INTERNAL_NL; } %ruby_ccallback 
    5353      | 
     
    5555      | 
    5656      (nonnewline - ws) @comment 
    57     )* :>> '=end'
     57    )* :>> '=end' @commit
    5858  ruby_comment = ruby_line_comment | ruby_block_comment; 
    5959 
    6060  ruby_sq_str = 
    61     '\'' @code ( 
     61    '\'' @queue @code ( 
    6262      newline %{ entity = INTERNAL_NL; } %ruby_ccallback 
    6363      | 
     
    6767      | 
    6868      '\\' nonnewline @code 
    69     )* '\'' @code; 
     69    )* '\'' @commit @code; 
    7070  ruby_dq_str = 
    71     '"' @code ( 
     71    '"' @queue @code ( 
    7272      newline %{ entity = INTERNAL_NL; } %ruby_ccallback 
    7373      | 
     
    7777      | 
    7878      '\\' nonnewline @code 
    79     )* '"' @code; 
     79    )* '"' @commit @code; 
    8080  # TODO: true literal string detection 
    8181  # Turns out any non-alphanum char can be after the initial '%' for a literal 
     
    8888  # closing char in the literal string below. 
    8989  ruby_lit_str = 
    90     '%' [qQ]? [(\[{] @code ( 
     90    '%' [qQ]? [(\[{] @queue @code ( 
    9191      newline %{ entity = INTERNAL_NL; } %ruby_ccallback 
    9292      | 
     
    9696      | 
    9797      '\\' nonnewline @code 
    98     )* [)\]}] @code; 
     98    )* [)\]}] @commit @code; 
    9999  ruby_cmd_str = 
    100     '`' @code ( 
     100    '`' @queue @code ( 
    101101      newline %{ entity = INTERNAL_NL; } %ruby_ccallback 
    102102      | 
     
    106106      | 
    107107      '\\' nonnewline @code 
    108     )* '`' @code; 
    109   ruby_regex = 
    110     '/' @code ( 
    111       newline %{ entity = INTERNAL_NL; } %ruby_ccallback 
    112       | 
    113       ws 
    114       | 
    115       [^\r\n\f\t /\\] @code 
    116       | 
    117       '\\' nonnewline @code 
    118     )* '/' @code; 
     108    )* '`' @commit @code; 
     109  ruby_regex = '/' ([^\r\n\f\t /\\] | '\\' nonnewline)* '/' @code; 
    119110  # TODO: true literal array and command detection 
    120111  # See TODO above about literal string detection 
    121112  ruby_lit_other = 
    122     '%' [wrx] [(\[{] @code ( 
     113    '%' [wrx] [(\[{] @queue @code ( 
    123114      newline %{ entity = INTERNAL_NL; } %ruby_ccallback 
    124115      | 
     
    128119      | 
    129120      '\\' nonnewline @code 
    130     )* [)\]}] @code; 
     121    )* [)\]}] @commit @code; 
    131122  # TODO: heredoc detection 
    132123  # This is impossible with current Ragel. We need to extract what the end