ext/nkf/nkf.c


DEFINITIONS

This source file includes the following functions.
  1. rb_nkf_putchar
  2. rb_nkf_kconv
  3. rb_nkf_guess
  4. Init_nkf


   1  #include "ruby.h"
   2  
   3  #define _AUTO           0
   4  #define _JIS            1
   5  #define _EUC            2
   6  #define _SJIS           3
   7  #define _BINARY         4
   8  #define _NOCONV         4
   9  #define _UNKNOWN        _AUTO
  10  
  11  #undef getc
  12  #undef ungetc
  13  #define getc(f)         (input_ctr<i_len?input[input_ctr++]:-1)
  14  #define ungetc(c,f)     input_ctr--
  15  
  16  #undef putchar
  17  #define putchar(c)      rb_nkf_putchar(c)
  18  
  19  #define INCSIZE         32
  20  static int incsize;
  21  
  22  static unsigned char *input, *output;
  23  static int input_ctr, i_len;
  24  static int output_ctr, o_len;
  25  
  26  static VALUE dst;
  27  
  28  static int
  29  rb_nkf_putchar(c)
  30       unsigned int c;
  31  {
  32    if (output_ctr >= o_len) {
  33      o_len += incsize;
  34      rb_str_cat(dst, 0, incsize);
  35      output = RSTRING(dst)->ptr;
  36      incsize *= 2;
  37    }
  38    output[output_ctr++] = c;
  39  
  40    return c;
  41  }
  42  
  43  #define PERL_XS 1
  44  #include "nkf1.7/nkf.c"
  45  
  46  static VALUE
  47  rb_nkf_kconv(obj, opt, src)
  48       VALUE obj, opt, src;
  49  {
  50    char *opt_ptr, *opt_end;
  51    volatile VALUE v;
  52  
  53    reinit();
  54    StringValue(opt);
  55    opt_ptr = RSTRING(opt)->ptr;
  56    opt_end = opt_ptr + RSTRING(opt)->len;
  57    for (; opt_ptr < opt_end; opt_ptr++) {
  58      if (*opt_ptr != '-') {
  59        continue;
  60      }
  61      arguments(opt_ptr);
  62    }
  63  
  64    incsize = INCSIZE;
  65  
  66    input_ctr = 0; 
  67    StringValue(src);
  68    input = RSTRING(src)->ptr;
  69    i_len = RSTRING(src)->len;
  70    dst = rb_str_new(0, i_len*3 + 10);
  71    v = dst;
  72  
  73    output_ctr = 0;
  74    output     = RSTRING(dst)->ptr;
  75    o_len      = RSTRING(dst)->len;
  76    *output    = '\0';
  77  
  78    if(iso8859_f && (oconv != j_oconv || !x0201_f )) {
  79      iso8859_f = FALSE;
  80    } 
  81  
  82    kanji_convert(NULL);
  83    RSTRING(dst)->ptr[output_ctr] = '\0';
  84    RSTRING(dst)->len = output_ctr;
  85    OBJ_INFECT(dst, src);
  86  
  87    return dst;
  88  }
  89  
  90  /*
  91   * Character code detection - Algorithm described in:
  92   * Ken Lunde. `Understanding Japanese Information Processing'
  93   * Sebastopol, CA: O'Reilly & Associates.
  94   */
  95  
  96  static VALUE
  97  rb_nkf_guess(obj, src)
  98       VALUE obj, src;
  99  {
 100    unsigned char *p;
 101    unsigned char *pend;
 102    int sequence_counter = 0;
 103  
 104    StringValue(src);
 105    p = RSTRING(src)->ptr;
 106    pend = p + RSTRING(src)->len;
 107    if (p == pend) return INT2FIX(_UNKNOWN);
 108  
 109  #define INCR do {\
 110      p++;\
 111      if (p==pend) return INT2FIX(_UNKNOWN);\
 112      sequence_counter++;\
 113      if (sequence_counter % 2 == 1 && *p != 0xa4)\
 114          sequence_counter = 0;\
 115      if (6 <= sequence_counter) {\
 116          sequence_counter = 0;\
 117          return INT2FIX(_EUC);\
 118      }\
 119  } while (0)
 120  
 121    if (*p == 0xa4)
 122      sequence_counter = 1;
 123  
 124    while (p<pend) {
 125      if (*p == '\033') {
 126        return INT2FIX(_JIS);
 127      }
 128      if (*p < '\006' || *p == 0x7f || *p == 0xff) {
 129        return INT2FIX(_BINARY);
 130      }
 131      if (0x81 <= *p && *p <= 0x8d) {
 132        return INT2FIX(_SJIS);
 133      }
 134      if (0x8f <= *p && *p <= 0x9f) {
 135        return INT2FIX(_SJIS);
 136      }
 137      if (*p == 0x8e) {   /* SS2 */
 138        INCR;
 139        if ((0x40 <= *p && *p <= 0x7e) ||
 140            (0x80 <= *p && *p <= 0xa0) ||
 141            (0xe0 <= *p && *p <= 0xfc))
 142          return INT2FIX(_SJIS);
 143      }
 144      else if (0xa1 <= *p && *p <= 0xdf) {
 145        INCR;
 146        if (0xf0 <= *p && *p <= 0xfe)
 147          return INT2FIX(_EUC);
 148        if (0xe0 <= *p && *p <= 0xef) {
 149          while (p < pend && *p >= 0x40) {
 150            if (*p >= 0x81) {
 151              if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) {
 152                return INT2FIX(_SJIS);
 153              }
 154              else if (0xfd <= *p && *p <= 0xfe) {
 155                return INT2FIX(_EUC);
 156              }
 157            }
 158            INCR;
 159          }
 160        }
 161        else if (*p <= 0x9f) {
 162          return INT2FIX(_SJIS);
 163        }
 164      }
 165      else if (0xf0 <= *p && *p <= 0xfe) {
 166        return INT2FIX(_EUC);
 167      }
 168      else if (0xe0 <= *p && *p <= 0xef) {
 169        INCR;
 170        if ((0x40 <= *p && *p <= 0x7e) ||
 171            (0x80 <= *p && *p <= 0xa0)) {
 172          return INT2FIX(_SJIS);
 173        }
 174        if (0xfd <= *p && *p <= 0xfe) {
 175          return INT2FIX(_EUC);
 176        }
 177      }
 178      INCR;
 179    }
 180    return INT2FIX(_UNKNOWN);
 181  }
 182  
 183  void
 184  Init_nkf()
 185  {
 186      VALUE mKconv = rb_define_module("NKF");
 187  
 188      rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2);
 189      rb_define_module_function(mKconv, "guess", rb_nkf_guess, 1);
 190  
 191      rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO));
 192      rb_define_const(mKconv, "JIS", INT2FIX(_JIS));
 193      rb_define_const(mKconv, "EUC", INT2FIX(_EUC));
 194      rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS));
 195      rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY));
 196      rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV));
 197      rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN));
 198  }