Friday, July 16, 2010

CLOB to VARCHAR2 conversion problems on UTF8 databases

Some well-known facts:
  • VARCHAR2 has a max size of 32767 characters
  • UTF8 is a multi-byte character set, one character uses up to 4 bytes
  • 32767/4 = 8191.75
It looks like there are conversion problems when copying from a CLOB to a VARCHAR2 on UTF8 databases. If the VARCHAR2 gets assigned a CLOB (either directly or via SUBSTR(clob,...)) of more than 8191 characters, this raises VALUE_ERROR (see example below). If reading via DBMS_LOB.SUBSTR, the result gets truncated to 8191 characters. Reading via DBMS_LOB.READ works as expected.

An annoying inconsistency. DB was 11.1.0.6 on Windows.

CN@cn> select value from v$nls_parameters where parameter='NLS_CHARACTERSET';



VALUE
----------------------------------------------------------------
WE8MSWIN1252
 

CN@cn> declare
2 c clob;
3 v varchar2(10000);
4 begin
5 for i in 8191 .. 8192 loop
6 v := rpad('abc',i);
7 c := v;
8 dbms_output.put_line('Trying length '||i||'...');
9 v := c;
10 dbms_output.put_line('...ok');
11 end loop;
12 end;
13 /
Trying length 8191...
...ok
Trying length 8192...
...ok
PL/SQL-Prozedur erfolgreich abgeschlossen.
 

CN@cn> conn cn/test@cnutf
Connect durchgef▒hrt.
CN@cnutf> select value from v$nls_parameters where parameter='NLS_CHARACTERSET';
 

VALUE
----------------------------------------------------------------
AL32UTF8
CN@cnutf> declare

2 c clob;
3 v varchar2(10000);
4 begin
5 for i in 8191 .. 8192 loop
6 v := rpad('abc',i);
7 c := v;
8 dbms_output.put_line('Trying length '||i||'...');
9 v := c;
10 dbms_output.put_line('...ok');
12 end;
13 /
Trying length 8191...
...ok
Trying length 8192...
declare
*
FEHLER in Zeile 1:
ORA-06502: PL/SQL: numerischer oder Wertefehler
ORA-06512: in Zeile 9

Wednesday, July 7, 2010

Writing parsers with m4o

This is a short example of my latest addition to the m4o toolchain. With m4o_lexer you can quite easily write simple LL(1) recursive descent parsers for domain-specific languages in PL/SQL. The lexer is implemented via regexp_substr and regexp_instr. This probably means that performance is not great, but for small code that's not an issue.

@package-begin sample_parser
  @plsql
    c_equal constant m4o_lexer.token_t := 1;
    c_plus  constant m4o_lexer.token_t := 2;
    c_begin constant m4o_lexer.token_t := 3;
    c_end   constant m4o_lexer.token_t := 4;
    c_if    constant m4o_lexer.token_t := 5;
    c_then  constant m4o_lexer.token_t := 6;
    c_else  constant m4o_lexer.token_t := 7;
    c_ident constant m4o_lexer.token_t := 99;
    c_num   constant m4o_lexer.token_t := 100;
    procedure parse_stmtseq;
  @end
--------------------------------------------------------------------------------
  @procedure parse_assign
  @declare
    v_var   varchar2(30);
    v_value varchar2(30);
  @begin
    -- [ident] = [num]
    v_var   := m4o_lexer.text;
    m4o_lexer.eat(c_ident);
    m4o_lexer.eat(c_equal);
    v_value := m4o_lexer.text;
    m4o_lexer.eat(c_num);
    dbms_output.put_line('assigning '
                       ||v_value
                       ||' to '
                       ||v_var);
  @end
--------------------------------------------------------------------------------
  @procedure parse_if
  @begin
    m4o_lexer.eat(c_if);
    m4o_lexer.eat(c_ident);
    m4o_lexer.eat(c_equal);
    m4o_lexer.eat(c_num);
    m4o_lexer.eat(c_then);
    parse_stmtseq;
    if m4o_lexer.cur = c_else then
      m4o_lexer.eat;
      parse_stmtseq;
    end if;
    m4o_lexer.eat(c_end);
  @end
--------------------------------------------------------------------------------
  @procedure parse_stmtseq
  @begin
    loop
      case m4o_lexer.cur
      when c_if then
        parse_if;
      when c_ident then
        parse_assign;
      when c_begin then
        m4o_lexer.eat;
        parse_stmtseq;
        m4o_lexer.eat(c_end);
      else
        exit;
      end case;
    end loop;
  @end
--------------------------------------------------------------------------------
  @procedure parse*
    i_code in varchar2
  @begin
    m4o_lexer.begin_define_tokens;
    m4o_lexer.set_whitespace('[[:space:]]+');
    m4o_lexer.set_token(c_equal,'=');
    m4o_lexer.set_token(c_plus ,'\\+');
    m4o_lexer.set_token(c_begin,'begin');
    m4o_lexer.set_token(c_end  ,'end');
    m4o_lexer.set_token(c_if   ,'if');
    m4o_lexer.set_token(c_then ,'then');
    m4o_lexer.set_token(c_else ,'else');
    m4o_lexer.set_token(c_ident,'[a-z][a-z0-9_#$]*');
    m4o_lexer.set_token(c_num  ,'[0-9]+');

    m4o_lexer.begin_reading(i_code);

    m4o_lexer.eat(c_begin);
    parse_stmtseq;
    m4o_lexer.eat(c_end);

    m4o_lexer.end_reading;
  @end
--------------------------------------------------------------------------------
  @procedure main*
  @begin
    parse(
      'begin
        i = 3
        IF FOO = 7 THEN
          I = 5
          bar = 10
        else begin b=0 end end
      end');
  @end
@package-end