tmail/tmail/mails/mails.c

/* vi:set sw=4 ts=4:

    mails.c version 0.1.1

    Copyright (c) 1998,1999 Minero Aoki <aamine@dp.u-netsurf.ne.jp>

    This program is free software.
    You can distribute/modify this program under the terms of
    the GNU Lesser General Public License version 2 or later.

*/

#include <stdio.h>
#ifdef __STDC__
# include <stdlib.h>
#endif

#include "ruby.h"


static VALUE MailScanner;
static VALUE ScanError;


/*
 * Scanner state wrapped by a MailScanner object.
 */
struct mails
{
    unsigned char *ptr;     /* start of the text being scanned */
    unsigned char *p;       /* current scan position */
    unsigned char *pend;    /* one past the last byte of the text */
    unsigned int flags;     /* bitwise OR of MODE_* and MAILS_DEBUG */
    VALUE comments;         /* Array that receives comment bodies, or Qnil */
};

#define MODE_MIME (1 << 0)
#define MODE_RECV (1 << 1)

#define MIME_MODE_P(s) ((s)->flags & MODE_MIME)
#define RECV_MODE_P(s) ((s)->flags & MODE_RECV)

#define MAILS_DEBUG (1 << 4)


#define BUFSIZE 256

#define GET_SCANNER(val, s) Data_Get_Struct(val, struct mails, s)


/*
 * Put a scanner into its pristine state: no text attached, no mode
 * flags set, and no comment collector.
 */
static void
real_reset(sc)
    struct mails *sc;
{
    sc->ptr = sc->p = sc->pend = 0;
    sc->comments = Qnil;
    sc->flags = 0;
}


/*
 * Free callback registered with Data_Wrap_Struct: releases the
 * scanner structure itself.
 */
static void
mails_free(sc)
    struct mails *sc;
{
    free(sc);
}


/*
 * GC mark callback: keeps the comment collector array alive for as long
 * as the scanner object that pushes into it is alive.
 */
static void
mails_mark(sc)
    struct mails *sc;
{
    rb_gc_mark(sc->comments);
}


/*
 * MailScanner.new(str, ident, cmt)
 *
 *   str   - String to scan.
 *   ident - header identifier; "RecvH" selects Received mode, and
 *           "CTypeH", "CEncodingH" and "CDispositionH" select MIME mode.
 *           Any other value leaves the default (atom) mode.
 *   cmt   - Array that receives the bodies of skipped comments, or nil
 *           to discard them.
 *
 * Raises TypeError (via Check_Type / STR2CSTR) when an argument has the
 * wrong type.
 *
 * NOTE(review): the scanner keeps raw pointers into `str' but holds no
 * reference to the String itself, so the caller must keep `str' alive and
 * unmodified while scanning.  Fixing that needs a VALUE field in
 * struct mails.
 */
static VALUE
mails_s_new(klass, str, ident, cmt)
    VALUE klass, str, ident, cmt;
{
    struct mails *sc;
    char *tmp;
    unsigned int flags = 0;

    /*
     * Validate every argument before allocating, so that a raised
     * exception cannot leak the scanner structure.
     */
    Check_Type(str, T_STRING);

    tmp = STR2CSTR(ident);
    if      (strcmp(tmp, "RecvH") == 0)         flags |= MODE_RECV;
    else if (strcmp(tmp, "CTypeH") == 0)        flags |= MODE_MIME;
    else if (strcmp(tmp, "CEncodingH") == 0)    flags |= MODE_MIME;
    else if (strcmp(tmp, "CDispositionH") == 0) flags |= MODE_MIME;

    if (! NIL_P(cmt))
        Check_Type(cmt, T_ARRAY);

    sc = ALLOC_N(struct mails, 1);
    real_reset(sc);

    sc->ptr = sc->p = (unsigned char *)RSTRING(str)->ptr;
    sc->pend = sc->p + RSTRING(str)->len;
    sc->flags = flags;
    sc->comments = cmt;     /* Qnil when no collector was given */

    return Data_Wrap_Struct(MailScanner, mails_mark, mails_free, sc);
}


/*
 * MailScanner#debug
 *
 * Returns true when the MAILS_DEBUG flag is set on this scanner,
 * false otherwise.
 */
static VALUE
mails_debug_get(self)
    VALUE self;
{
    struct mails *sc;

    GET_SCANNER(self, sc);
    return (sc->flags & MAILS_DEBUG) ? Qtrue : Qfalse;
}


/*
 * MailScanner#debug = flag
 *
 * Sets the MAILS_DEBUG flag when `flag' is true in the Ruby sense and
 * clears it otherwise.  Always returns nil.
 */
static VALUE
mails_debug_set(self, flag)
    VALUE self, flag;
{
    struct mails *sc;

    GET_SCANNER(self, sc);
    if (! RTEST(flag)) {
        sc->flags &= ~MAILS_DEBUG;
    }
    else {
        sc->flags |= MAILS_DEBUG;
    }
    return Qnil;
}


/* --------- scan functions ----------- */


/* This implementation is VERY rough, but it is usually good enough. */

#define ESC '\033'

/*
 * Skip forward until just after the next "\e(B" (switch back to ASCII)
 * escape sequence.  If no such sequence is found, the position ends up
 * at the end of the input.
 *
 * The sequence is matched as a 3-byte prefix at the current position and
 * is bounds-checked against sc->pend; comparing with strcmp() would only
 * match when the sequence happened to be the very last bytes of a
 * NUL-terminated buffer.
 */
static void
fwd_jis(sc)
    struct mails *sc;
{
    for (; sc->p < sc->pend; sc->p++) {
        if (*sc->p == ESC
            && sc->pend - sc->p >= 3
            && sc->p[1] == '('
            && sc->p[2] == 'B') {
            sc->p += 3;
            break;
        }
    }
}

#define IS_JCHAR(ch) ((ch) > 127)

/*
 * Skip a run of 8-bit characters.  Each byte above 127 is consumed
 * together with the byte that follows it (when there is one); the scan
 * stops at the first position holding a 7-bit byte or at the end of
 * the input.
 */
static void
fwd_jstr(sc)
    struct mails *sc;
{
    while (sc->p < sc->pend && IS_JCHAR(*sc->p)) {
        sc->p++;
        if (sc->p < sc->pend)
            sc->p++;
    }
}


/*
 * Scan an atom starting at the current position and return it as a new
 * String.  The atom ends at the first special character, at the first
 * byte with a value of 32 or below (other than ESC), or at the end of
 * the input.  ESC-introduced sequences and 8-bit runs are swallowed
 * whole via fwd_jis / fwd_jstr.
 */
static VALUE
scan_atom(sc)
    struct mails *sc;
{
    unsigned char *start = sc->p;
    int done = 0;

    while (! done && sc->p < sc->pend) {
        unsigned char ch = *sc->p;

        if (ch == ESC) {
            /* must be tested before the "<= 32" rule: ESC is 27 */
            fwd_jis(sc);
        }
        else if (ch > 127) {
            fwd_jstr(sc);
        }
        else if (ch <= 32) {
            done = 1;
        }
        else {
            switch (ch) {
            case '(': case ')':
            case '<': case '>':
            case '[': case ']':
            case '@': case ',':
            case ';': case ':':
            case '.':
            case '\\':
            case '\"':
                done = 1;
                break;

            default:
                sc->p++;
                break;
            }
        }
    }

    return rb_str_new(start, sc->p - start);
}


/*
 * Scan a MIME token starting at the current position and return it as a
 * new String.  The token ends at the first tspecial listed below, at the
 * first byte with a value of 32 or below (other than ESC), or at the end
 * of the input.  ESC-introduced sequences and 8-bit runs are swallowed
 * whole via fwd_jis / fwd_jstr.
 */
static VALUE
scan_token(sc)
    struct mails *sc;
{
    unsigned char *ret;

    ret = sc->p;
    while (sc->p < sc->pend) {
        switch (*sc->p) {
        case '(':
        case ')':
        case '<':
        case '>':
        case '@':
        case ',':
        case ';':
        case ':':
        case '\\':
        case '\"':
        case '/':
        case '[':
        case ']':
        case '?':
        case '=':
            goto retval;

        case ESC:
            fwd_jis(sc);
            continue;

        default:
            if (*sc->p > 127) {
                /*
                 * fwd_jstr already leaves sc->p on the first byte after
                 * the 8-bit run.  Re-examine that byte instead of
                 * stepping over it: it may be a delimiter, or sc->p may
                 * already be at sc->pend.
                 */
                fwd_jstr(sc);
                continue;
            }
            else if (*sc->p <= 32) {
                goto retval;
            }
            break;
        }
        sc->p++;
    }

retval:

    return rb_str_new((char *)ret, sc->p - ret);
}


/*
 * Scan a quoted string.  The current position must be on the opening
 * double quote; otherwise Qnil is returned and nothing is consumed.
 *
 * Returns a new String holding the text between the quotes, with each
 * backslash removed and the byte following it taken literally.  If the
 * closing quote is missing, everything up to the end of the input is
 * returned.
 *
 * The result is accumulated in a Ruby String rather than a fixed-size
 * stack buffer, so quoted strings of any length are safe, and a
 * backslash at the very end of the input no longer reads past sc->pend.
 */
static VALUE
scan_quoted(sc)
    struct mails *sc;
{
    VALUE result;
    unsigned char *run;     /* start of the pending, not yet copied run */

    if (sc->p >= sc->pend || *sc->p != '\"')
        return Qnil;
    sc->p++;

    result = rb_str_new("", 0);
    run = sc->p;
    while (sc->p < sc->pend) {
        if (*sc->p == '\"') {
            rb_str_cat(result, (char *)run, sc->p - run);
            sc->p++;
            return result;
        }
        if (*sc->p == '\\') {
            /* flush what we have, drop the backslash ... */
            rb_str_cat(result, (char *)run, sc->p - run);
            sc->p++;
            run = sc->p;
            if (sc->p >= sc->pend)
                break;
            /* ... and fall through to take the next byte literally */
        }
        sc->p++;
    }

    /* unterminated: hand back whatever was collected */
    rb_str_cat(result, (char *)run, sc->p - run);
    return result;
}


/*
 * Scan a domain literal starting at the current position and return it
 * as a new String.  Scanning stops after the first unescaped ']', which
 * is consumed but not included in the result; each backslash is removed
 * and the byte following it is taken literally.  If ']' is missing,
 * everything up to the end of the input is returned.
 *
 * NOTE(review): the caller invokes this with the position still on the
 * opening '[', and that byte is copied into the result like any other
 * (so "[1.2.3.4]" yields "[1.2.3.4").  This is preserved as is because
 * callers may depend on it -- confirm whether it is intended.
 *
 * The result is accumulated in a Ruby String rather than a fixed-size
 * stack buffer, so literals of any length are safe, and a backslash at
 * the very end of the input no longer reads past sc->pend.
 */
static VALUE
scan_domlit(sc)
    struct mails *sc;
{
    VALUE result;
    unsigned char *run;     /* start of the pending, not yet copied run */

    result = rb_str_new("", 0);
    run = sc->p;
    while (sc->p < sc->pend) {
        if (*sc->p == ']') {
            rb_str_cat(result, (char *)run, sc->p - run);
            sc->p++;
            return result;
        }
        if (*sc->p == '\\') {
            /* flush what we have, drop the backslash ... */
            rb_str_cat(result, (char *)run, sc->p - run);
            sc->p++;
            run = sc->p;
            if (sc->p >= sc->pend)
                break;
            /* ... and fall through to take the next byte literally */
        }
        sc->p++;
    }

    /* unterminated: hand back whatever was collected */
    rb_str_cat(result, (char *)run, sc->p - run);
    return result;
}
    

/*
static VALUE
scan_digit(sc)
    struct mails *sc;
{
    unsigned char *ret;

    ret = sc->p;
    while (sc->p < sc->pend) {
        switch (*sc->p) {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            break;
        default:
            goto retval;
        }
        sc->p++;
    }

retval:
    
    return rb_str_new(ret, sc->p - ret);
}
*/
            

/*
static VALUE
scan_boundary(sc)
    struct mails *sc;
{
    unsigned char *ret;

    ret = sc->p;
    while (sc->p < sc->pend) {
        switch (*sc->p) {
        case ' ':
        case '\'':
        case '(':
        case ')':
        case '+':
        case '_':
        case ',':
        case '-':
        case '.':
        case '/':
        case ':':
        case '=':
        case '?':
            break;
        default:
            if (*sc->p >= 'A' && *sc->p <= 'Z') break;
            if (*sc->p >= 'a' && *sc->p <= 'z') break;
            if (*sc->p >= '0' && *sc->p <= '9') break;

            goto retval;
        }
    }

retval:

    return rb_str_new(ret, sc->p - ret);
}
*/


/*
 * Step over one end-of-line sequence ("\n", "\r" or "\r\n") at the
 * current position.  Does nothing when the position is at the end of the
 * input or not on a line terminator.  Never reads at or beyond sc->pend.
 */
static void
fwd_eol(sc)
    struct mails *sc;
{
    if (sc->p >= sc->pend)
        return;

    if (*sc->p == '\n') {
        sc->p++;
    }
    else if (*sc->p == '\r') {
        sc->p++;
        /* a lone "\r" may be the very last byte of the input */
        if (sc->p < sc->pend && *sc->p == '\n')
            sc->p++;
    }
}


static VALUE
scan_comment(sc)
    struct mails *sc;
{
    int nest = 0;
    unsigned char *ret;

    ret = sc->p;
    while (sc->p < sc->pend) {
        if (*sc->p == ESC) {
            fwd_jis(sc);
            continue;
        }
        if (IS_JCHAR(*sc->p)) {
            fwd_jstr(sc);
            continue;
        }
        switch (*sc->p) {
        case '(':
            nest++;
            break;
        case ')':
            nest--;
            if (nest == 0) {
                sc->p++;
                goto retval;
            }
            break;
        case '\\':
            sc->p++;
            break;
        case '\r':
        case '\n':
            fwd_eol(sc);
            continue;
        }
        sc->p++;
    }

    rb_raise(ScanError, "unterminated comment");

retval:

    return rb_str_new(ret + 1, sc->p - ret - 2);
}


/*
 * Return 1 if the character `ch' occurs in the NUL-terminated string
 * `str', 0 otherwise.  The terminating NUL itself is never matched.
 */
static int
findin(ch, str)
    char ch;
    char *str;
{
    char *cur = str;

    while (*cur != '\0') {
        if (*cur++ == ch)
            return 1;
    }
    return 0;
}

/*
 * Character classification macros.  Every use of the argument is
 * parenthesized so that compound expressions classify correctly; note
 * that the argument is still evaluated more than once, so it must not
 * have side effects.
 */

/* linear white space: blank, tab, or a line terminator */
#define IS_LWSP(ch) \
    ((ch) == ' ' || (ch) == '\t' || (ch) == '\n' || (ch) == '\r')

/* ASCII decimal digit */
#define IS_DIGIT(ch) ((ch) >= '0' && (ch) <= '9')

/*
 * Character that may start an atom.  '_' is included: it is not an
 * RFC 822 special, and scan_atom already accepts it inside an atom.
 */
#define IS_ATOMCHAR(ch) \
    (((ch) >= 'a' && (ch) <= 'z') || ((ch) >= 'A' && (ch) <= 'Z') || \
     ((ch) >= '0' && (ch) <= '9') || ((ch) == ESC) || IS_JCHAR(ch) || \
     findin((ch), "_#!$%&'`*+-{|}~^/=?"))

/*
 * Character that may start a MIME token.  '_' is included for the same
 * reason as above; scan_token already accepts it inside a token.
 */
#define IS_TOKENCHAR(ch) \
    (((ch) >= 'a' && (ch) <= 'z') || ((ch) >= 'A' && (ch) <= 'Z') || \
     ((ch) >= '0' && (ch) <= '9') || ((ch) == ESC) || IS_JCHAR(ch) || \
     findin((ch), "_#!$%&'`*+-{|}~^."))


/*
 * Advance the current position over any run of linear white space,
 * stopping at the first other byte or at the end of the input.
 */
static void
skip_lwsp(sc)
    struct mails *sc;
{
    while (sc->p < sc->pend && IS_LWSP(*sc->p))
        sc->p++;
}

/*
 * ASCII-only case helpers.  The upper bound of the upper-case range is
 * tested with "<= 'Z'"; testing ">= 'Z'" would leave 'A'..'Y' unconverted
 * and would mangle every byte above 'Z'.  Arguments are evaluated more
 * than once, so they must not have side effects.
 */
#define IS_ALPHA(ch) \
    (((ch) >= 'a' && (ch) <= 'z') || ((ch) >= 'A' && (ch) <= 'Z'))
#define IS_UPPER(ch) ((ch) >= 'A' && (ch) <= 'Z')
#define TO_LOWER(ch) (IS_UPPER(ch) ? (ch) + 32 : (ch))

/*
 * Compare two NUL-terminated strings, ignoring the case of ASCII
 * letters.  Returns 1 when they are equal (same length and same
 * characters up to case), 0 otherwise.
 */
static int
nccmp(a, b)
    char *a, *b;
{
    while (*a && *b) {
        if (*a != *b) {
            if (TO_LOWER(*a) != TO_LOWER(*b))
                return 0;
        }
        a++;
        b++;
    }
    /* equal only if both strings ended at the same time */
    return (*a == *b) ? 1 : 0;
}

/*
 * Return 1 when every byte of the String `vret' is an ASCII decimal
 * digit, 0 as soon as any other byte is found.  An empty String
 * yields 1.
 */
static int
digit_p(vret)
    VALUE vret;
{
    char *p;
    char *end;

    p = RSTRING(vret)->ptr;
    end = p + RSTRING(vret)->len;
    while (p < end) {
        if (! IS_DIGIT(*p))
            return 0;
        p++;
    }
    return 1;
}
        
/*
 * Token type values returned in slot 0 of the array filled in by
 * mails_scan.  They are initialized in Init_mails.
 */
static VALUE tok_atom, tok_digit, tok_token, tok_quoted, tok_domlit;
/* Keyword token types, produced only in Received mode. */
static VALUE tok_from, tok_by, tok_via, tok_with, tok_id, tok_for;

/*
 * MailScanner#scan(arr)
 *
 * Scan the next token and store the result into the Array `arr':
 *
 *   arr[0] - token type: one of the tok_* values, or, for a single
 *            character that fits no other category, the same
 *            one-character String that is stored in arr[1].
 *   arr[1] - token text as a String.
 *
 * At the end of the input both slots are set to false.  Leading white
 * space is skipped.  Comments are never returned as tokens; their bodies
 * are pushed onto the collector array given to MailScanner.new, if any.
 *
 * In MIME mode words are scanned as TOKEN.  Otherwise they are scanned
 * as ATOM; in Received mode the words from/by/via/with/id/for (compared
 * case-insensitively) get their own token types, and an all-digit word
 * is reported as DIGIT.
 *
 * Returns `arr'.  Raises ScanError if the scanner has no text attached,
 * or if a comment is not terminated; raises TypeError if `arr' is not an
 * Array.
 */
static VALUE
mails_scan(self, arr)
    VALUE self, arr;
{
    struct mails *sc;
    VALUE sret, vret;
    char *p;
    int debug;

    GET_SCANNER(self, sc);
    if (!sc->p) {
        rb_raise(ScanError, "Mails#scan called before reset");
    }
    Check_Type(arr, T_ARRAY);
    debug = sc->flags & MAILS_DEBUG;

    sret = Qnil;
    vret = Qnil;

    /* loops only to restart after a comment has been skipped */
    while (1) {
        if (debug) puts("new loop");

        if (sc->p == sc->pend) {
            sret = vret = Qfalse;
            break;
        }
        if (debug) puts("not void");

        if (IS_LWSP(*sc->p)) {
            skip_lwsp(sc);
            if (sc->p == sc->pend) {
                sret = vret = Qfalse;
                break;
            }
        }

        if (MIME_MODE_P(sc)) {
            if (IS_TOKENCHAR(*sc->p)) {
                if (debug) puts("TOKEN");
                sret = tok_token;
                vret = scan_token(sc);
                break;
            }
        }
        else {
            if (IS_ATOMCHAR(*sc->p)) {
                if (debug) puts("ATOM");
                sret = tok_atom;
                vret = scan_atom(sc);

                if (RECV_MODE_P(sc)) {
                    p = RSTRING(vret)->ptr;
                    if      (nccmp(p, "from")) sret = tok_from;
                    else if (nccmp(p, "by"))   sret = tok_by;
                    else if (nccmp(p, "via"))  sret = tok_via;
                    else if (nccmp(p, "with")) sret = tok_with;
                    else if (nccmp(p, "id"))   sret = tok_id;
                    else if (nccmp(p, "for"))  sret = tok_for;
                }
                if (digit_p(vret)) sret = tok_digit;
                break;
            }
        }

        if (*sc->p == '"') {
            if (debug) puts("QUOTED");
            sret = tok_quoted;
            vret = scan_quoted(sc);
        }
        else if (*sc->p == '(') {
            VALUE c;
            if (debug) puts("COMMENT");
            c = scan_comment(sc);
            if (! NIL_P(sc->comments))
                rb_ary_push(sc->comments, c);
            continue;
        }
        else if (*sc->p == '[') {
            if (debug) puts("DOMLIT");
            sret = tok_domlit;
            vret = scan_domlit(sc);
        }
        else {
            /* anything else is a one-character token of its own type */
            if (debug) puts("CHAR");
            sret = vret = rb_str_new((char *)sc->p, 1);
            sc->p++;
        }

        break;
    }

    rb_ary_store(arr, 0, sret);
    rb_ary_store(arr, 1, vret);

    return arr;
}


/*
 * Intern the C string `str' and return the resulting ID converted to a
 * Ruby value with INT2FIX.
 */
static VALUE
intn(str)
    char *str;
{
    return INT2FIX(rb_intern(str));
}

void
Init_mails()
{
    MailScanner = rb_define_class("MailScanner", rb_cObject);
    rb_define_singleton_method(MailScanner, "new", mails_s_new, 3) ;
    rb_define_method(MailScanner, "scan", mails_scan, 1);
    rb_define_method(MailScanner, "debug", mails_debug_get, 0);
    rb_define_method(MailScanner, "debug=", mails_debug_set, 1);

    ScanError = rb_eval_string("::ScanError");

    tok_atom   = intn("ATOM");
    tok_digit  = intn("DIGIT");
    tok_token  = intn("TOKEN");
    tok_quoted = intn("QUOTED");
    tok_domlit = intn("DOMLIT");

    tok_from = intn("FROM");
    tok_by   = intn("BY");
    tok_via  = intn("VIA");
    tok_with = intn("WITH");
    tok_id   = intn("ID");
    tok_for  = intn("FOR");
}