/* The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event. */

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include <ctype.h>

/*
 * Trace levels.  Synchronize these with the %trace_levels in
 * Mail::Reporter.
 */
#define TRACE_INTERNAL  7
#define NO_TRACE        6
#define TRACE_ERRORS    5
#define TRACE_WARNINGS  4
#define TRACE_PROGRESS  3
#define TRACE_NOTICES   2
#define TRACE_DEBUG     1

/* Size handed to fgets() for one line; may be overridden at build time. */
#ifndef MAX_LINE
#define MAX_LINE        1024
#endif

/*
 * Fallback only: the included headers normally define NULL already.
 * It must expand to a real null pointer constant; an empty expansion
 * would turn every `x = NULL' into a syntax error.
 */
#ifndef NULL
#define NULL  ((void *)0)
#endif

/* String terminator. */
#ifndef EOL
#define EOL  '\0'
#endif

/* Carriage return. */
#ifndef CR
#define CR   '\015'
#endif

/* Line feed. */
#ifndef LF
#define LF   '\012'
#endif

/* NOTE(review): MAX_FOLD and FOLDSTART are not used in the code
 * visible here; presumably header-folding parameters -- confirm. */
#define MAX_FOLD       512
#define FOLDSTART      " "

/* Number of bytes take_scalar() copies per fread() call. */
#define COPYSIZE       4096

/*
 * Separator
 * One element of a mailbox's separator stack.  A line is compared
 * against the first `length' bytes of `line'.
 */
typedef struct separator
{
    char             * line;     /* private copy of the separator text */
    int                length;   /* strlen(line)                       */
    struct separator * next;     /* separator pushed before this one   */
} Separator;

/*
 * Mailbox
 * Parser state for one opened mailbox file.
 */
typedef struct
{
    char      * filename;          /* private copy of the name given at open  */
    FILE      * file;              /* stream the lines are read from          */
    Separator * separators;        /* separator stack, newest entry first     */

    int         trace;             /* trace level, one of the TRACE_* values  */
    int         dosmode;           /* non-zero while CRLF endings are assumed */

    int         strip_gt;          /* number of pushed "From " separators     */
    int         keep_line;         /* unget: hand out `line' once more        */

    char        line[MAX_LINE+2];  /* one more for the newline which is added *
                                    * when the last line of a file on Windows *
                                    * lacks one                               */
    long        line_start;        /* ftell() taken just before `line' was    *
                                    * read                                    */
} Mailbox;

/* Slot table: boxes[n] is the mailbox with number n, NULL when the slot
 * is free.  The table itself is allocated on the first take_box_slot(). */
static Mailbox **boxes    = NULL;
static int       nr_boxes = 0;      /* number of slots in `boxes' */

/*
 * new_mailbox
 */

Mailbox * new_mailbox(char *filename)
{   Mailbox * box;

    New(0, box, 1, Mailbox);
    box->keep_line    = 0;
    box->strip_gt     = 0;
    box->dosmode      = 1;  /* will be set to 0 if not true */
    box->separators   = NULL;

    /* Copy the filename. */
    New(0, box->filename, strlen(filename)+1, char);
    strcpy(box->filename, filename);

    return box;
}

/*
 * take_box_slot
 * Store a mailbox in the first free slot of the table and return the
 * number of that slot.  The table starts with 10 slots and grows by
 * 10 slots each time it is full.
 */

static int take_box_slot(Mailbox *new)
{   int slot = 0;

    if(boxes==NULL)
    {   /* First use: create a table of empty slots. */
        nr_boxes = 10;
        Newz(0, boxes, nr_boxes, Mailbox *);
    }
    else
    {   /* Look for a slot which is not in use. */
        while(slot < nr_boxes && boxes[slot]!=NULL)
            slot++;

        if(slot >= nr_boxes)
        {   /* All taken: add 10 empty slots; `slot' is the first of them. */
            int grown = nr_boxes + 10;
            Renew(boxes, grown, Mailbox *);

            while(nr_boxes < grown)
                boxes[nr_boxes++] = NULL;
        }
    }

    boxes[slot] = new;
    return slot;
}

/*
 * free_box_slot
 * Mark a slot as unused.  Numbers outside the table are ignored.  The
 * mailbox which occupied the slot is not released here.
 */

static void free_box_slot(int boxnr)
{
    if(boxnr < 0 || boxnr >= nr_boxes)
        return;

    boxes[boxnr] = NULL;
}

/*
 * get_box
 * Translate a slot number into its mailbox.  Returns NULL when the
 * number lies outside the table or the slot is empty.
 */

static Mailbox *get_box(int boxnr)
{   int in_table = boxnr >= 0 && boxnr < nr_boxes;

    return in_table ? boxes[boxnr] : NULL;
}

/*
 * get_one_line
 * Return the next line of the mailbox, or NULL at end of file (or on
 * a read error).  The result points into box->line and is overwritten
 * by the next read.
 *
 * When a line was "ungot" (keep_line set), that same buffer is handed
 * out again without touching the file.  Otherwise box->line_start is
 * set to the file offset where the new line begins.
 *
 * While in dosmode, CRLF is rewritten to LF, and a line without any
 * newline gets one appended.  Dosmode is dropped on the first line
 * which ends in a bare LF.
 */

static char * get_one_line(Mailbox *box)
{
    if(box->keep_line)
    {   box->keep_line = 0;
        return box->line;
    }

    box->line_start = (long)ftell(box->file);
    if(!fgets(box->line, MAX_LINE, box->file))
        return NULL;

    if(box->dosmode)
    {   size_t len = strlen(box->line);

        /* Only a CR directly followed by the closing LF is a DOS line
         * ending.  Without the LF test, an unterminated line ending on
         * "\rX" would silently lose its last character.
         */
        if(len >= 2 && box->line[len-1]=='\n' && box->line[len-2]==CR)
        {   box->line[len-2] = '\n';         /* Remove CR's before LF's      */
            box->line[len-1] = EOL;
        }
        else
        if(len==0 || box->line[len-1]!='\n') /* Last line on Win* may lack   */
        {   box->line[len]   = '\n';         /*   newline.  Add it silently  */
            box->line[len+1] = EOL;          /* fits: len <= MAX_LINE-1      */
        }
        else box->dosmode = 0;               /* Apparently not dosmode at all*/

        /* NOTE(review): fgets() returns a line longer than MAX_LINE-1
         * bytes in pieces which lack a newline; in dosmode each piece
         * gets a newline appended above.
         */
    }

    return box->line;
}

/*
 * file_position
 * Give the file-position of the line to be processed: the start of
 * the kept line when one is waiting, the stream position otherwise.
 */

static long file_position(Mailbox *box)
{
    if(box->keep_line)
        return box->line_start;

    return (long)ftell(box->file);
}

/*
 * goto_position
 * Jump to a different place in the file.  WHERE is an offset from the
 * start of the file.  A kept (unget) line is dropped, because it does
 * not belong to the new position.
 *
 * Returns the result of fseek(): 0 on success, non-zero on failure.
 */

static int goto_position(Mailbox *box, long where)
{   box->keep_line = 0;
    return fseek(box->file, where, SEEK_SET);
}

/*
 * read_header_line
 * Read one header field, including its folded continuation lines.
 *
 * On success 1 is returned; *field is a new SV with the field name
 * (trailing blanks removed) and *content a new SV with everything
 * after the colon, leading blanks skipped and continuation lines
 * appended unchanged.  The caller owns both SVs.
 *
 * 0 is returned, and neither SV is created, at end of file, at the
 * empty line which closes the header (that line is consumed), and on
 * a line without a colon (that line is kept to be read again).
 */

static int read_header_line(Mailbox *box, SV **field, SV **content)
{
    char * line;
    char * reader;
    int    length, field_error;

    line   = get_one_line(box);
    if(line==NULL)    return 0;  /* end of file.          */
    if(line[0]=='\n') return 0;  /* normal end of header. */

    /*
     * Read the header's field.
     */

    /* Stop at the terminator as well: a piece of an over-long line
     * has no newline, and the scan must not leave the buffer.
     */
    for(reader = line; *reader!=':' && *reader!='\n' && *reader!=EOL; reader++)
        ;

    if(*reader!=':')
    {   fprintf(stderr, "Unexpected end of header (C parser):\n  %s", line);
        box->keep_line = 1;
        return 0;
    }

    /* Walk back over blanks between the name and the colon.  The
     * <ctype.h> functions need an unsigned char value; header bytes
     * may have the high bit set.
     */
    field_error = 0;
    for(length=reader-line-1;
        length >= 0 && isspace((unsigned char)line[length]);
        --length)
        field_error++;

    if(field_error && box->trace <= TRACE_WARNINGS)
    {   fprintf(stderr, "Blanks stripped after header-fieldname:\n  %s",line);
    }

    *field = newSVpvn(line, length+1);

    /*
     * Now read the content.
     */

    /* skip leading blanks. */
    for(++reader; isspace((unsigned char)*reader); ++reader)
        ;
    *content = newSVpv(reader, 0);

    /*
     * Add folded lines.
     */

    while(1)
    {   line = get_one_line(box);
        if(line==NULL) break;

        /* A continuation starts with a blank; an empty line does not
         * count as one.  Anything else belongs to what follows.
         */
        if( !isspace((unsigned char)line[0]) || line[0]=='\n')
        {   box->keep_line = 1;
            break;
        }
        sv_catpv(*content, line);
    }

    return 1;
}

/*
 * is_good_end
 * Tell whether WHERE can really be the end of the message: what
 * follows must be end-of-file, or optional empty lines followed by
 * a line which starts with the active (most recently pushed)
 * separator.  A negative WHERE means: test the current position.
 *
 * Without separators there is nothing to test, so 1 is returned.
 * A WHERE which cannot be seeked to gives 0.  The mailbox is always
 * put back on the position it had on entry.
 */

static int is_good_end(Mailbox *box, long where)
{   Separator *active = box->separators;
    long       resume;
    char      *line;
    int        matches;

    if(active==NULL)
        return 1;                 /* no seps, then we have to trust it. */

    resume = file_position(box);

    /* A successful goto_position() also discards the unget-line. */
    if(where >= 0 && goto_position(box, where)!=0)
    {   /* Impossible seek: file too short. */
        goto_position(box, resume);
        return 0;
    }

    /* Find the first non-empty line. */
    do  line = get_one_line(box);
    while(line!=NULL && line[0]=='\n' && line[1]==EOL);

    matches = line==NULL
           || strncmp(line, active->line, active->length)==0;

    goto_position(box, resume);
    return matches;
}

/*
 * skip_empty_lines
 * Consume lines as long as they start with a newline.  The first
 * other line is kept, so it is returned by the next read.  Stops
 * silently at end of file.
 */

static void skip_empty_lines(Mailbox *box)
{   char * line;

    while((line = get_one_line(box)) != NULL)
    {   if(line[0]=='\n')
            continue;

        box->keep_line = 1;
        return;
    }
}

/*
 * read_stripped_lines
 * In dosmode, the \r must be stripped from each line, and when the
 * "From " line separator is active, one > must be stripped from the
 * lines which match /^>+From /.
 *
 * Reading from a Windows file will translate \r\n into \n.  But it
 * is hard to find out whether this is the case.  However, the
 * Content-Length field counts both of these line separators.  That's
 * why ftell() is asked to provide the real location.
 */

/*
 * is_separator
 * Tell whether LINE starts with the separator SEP.  For the mbox
 * separator "From " the line must in addition contain something
 * which looks like a year: a '1' or '2' followed by three digits.
 * Returns 1 for a separator line, 0 otherwise.
 */
static int is_separator(Separator *sep, char *line)
{
   if(strncmp(sep->line, line, sep->length)!=0) return 0;

   if(strcmp(sep->line, "From ") !=0) return 1;

   /* From separators shall contain a year in the line.  The tests
    * short-circuit on the terminator, so the scan stays inside the
    * string.  isdigit() needs an unsigned char value: mail bytes may
    * have the high bit set.
    */
   for( ; *line; line++)
   {   if(   (line[0]=='1' || line[0]=='2')
          && isdigit((unsigned char)line[1])
          && isdigit((unsigned char)line[2])
          && isdigit((unsigned char)line[3])
         ) return 1;
   }

   return 0;
}

/*
 * Collect the lines of a body in a newly allocated array of newly
 * allocated strings; the caller has to Safefree() each string and the
 * array.  *nr_lines receives the number of lines stored, *nr_chars
 * the sum of their lengths after stripping.
 *
 * Reading stops
 *   - when expect_lines lines were read, or expect_chars bytes of the
 *     file were consumed, and is_good_end() accepts that position;
 *   - at end of file;
 *   - on a line which matches one of the pushed separators.
 * A negative expect_lines or expect_chars never matches, so that
 * test is then effectively off.
 */
static char **read_stripped_lines(Mailbox *box,
    int expect_chars, int expect_lines,
    int *nr_chars,    int *nr_lines)
{   char   ** lines      = NULL;
    int       max_lines;
    long      start      = file_position(box);
    int       last_blank = 0;     /* was the stored line just "\n"?    */
    long      last_position;      /* file offset of the stored line    */

    last_position = start;
    max_lines     = expect_lines >= 0 ? (expect_lines+10) : 1000;

    /*fprintf(stderr, "maxlines %ld\n", (long)max_lines);*/
    New(0, lines, max_lines, char *);
    *nr_lines = 0;
    *nr_chars = 0;

    while(1)
    {   char      *line;
        char      *linecopy;
        Separator *sep;
        int        length;

        /* Stop as soon as one of the predicted sizes is reached and
         * what follows looks like the end of the message.
         */
        if(*nr_lines == expect_lines && is_good_end(box, -1))
            break;

        if(file_position(box)-start == expect_chars && is_good_end(box,-1))
            break;

        line = get_one_line(box);
        if(line==NULL)   /* remove empty line before eof.*/
        {   /* Only done when separators are in use.  The file is put
             * back on the start of that empty line.
             */
            if(last_blank && box->separators)
            {   Safefree( lines[ --(*nr_lines) ] );
                (*nr_chars)--;
                goto_position(box, last_position);
                last_blank = 0;
            }
            break;
        }

        /*
         * Check for separator
         */

        sep = box->separators;
        while(sep != NULL && !is_separator(sep, line))
            sep = sep->next;

        if(sep!=NULL)
        {   /* Separator found */
            box->keep_line = 1;      /* keep separator line to read later.  */

            /* goto_position() clears keep_line again: after the seek,
             * the empty line and the separator are both read anew.
             */
            if(last_blank)           /* Remove blank line before separator. */
            {   Safefree( lines[ --(*nr_lines) ] );
                (*nr_chars)--;
                goto_position(box, last_position);
                last_blank = 0;
            }

            break;
        }

        /*
         *   >>>>From becomes >>>From
         */

        if(box->strip_gt && line[0]=='>')
        {   char *reader = line;
            while(*reader == '>') reader++;
            if(strncmp(reader, "From ", 5)==0)
               line++;               /* drop one leading '>' */
        }

        /*
         * Store line
         */

        /* Grow the array by half when it is full. */
        if(*nr_lines >= max_lines)
        {   max_lines = max_lines + max_lines/2;
            lines = Renew(lines, max_lines, char *);
        }

        length           = strlen(line);
        last_blank       = length==1;      /* just the newline */
        last_position    = box->line_start;

        New(0, linecopy, length+1, char);
        strcpy(linecopy, line);

        lines[*nr_lines] = linecopy;

        (*nr_lines)++;
        *nr_chars       += length;
    }

    return lines;
}

/*
 * scan_stripped_lines
 * Like read_stripped_lines, but without allocating memory: the lines
 * are only counted.  *nr_lines receives the number of lines, and
 * *nr_chars the sum of their lengths after stripping.
 *
 * Scanning stops when expect_lines lines were read, or expect_chars
 * bytes of the file were consumed, and is_good_end() accepts that
 * position; at end of file; or on a line which matches one of the
 * pushed separators.  Always returns 1.
 */

static int scan_stripped_lines(Mailbox *box,
    int expect_chars, int expect_lines,
    int *nr_chars,    int *nr_lines)
{   long      start          = file_position(box);
    long      last_position;      /* file offset of the counted line */
    int       last_blank     = 0; /* was the counted line just "\n"? */

    *nr_lines     = 0;
    *nr_chars     = 0;
    last_position = start;

    while(1)
    {   char      *line;
        Separator *sep;
	int        length;

        /* Stop as soon as one of the predicted sizes is reached and
         * what follows looks like the end of the message.
         */
        if(*nr_lines == expect_lines && is_good_end(box, -1))
            break;

        if(file_position(box)-start == expect_chars && is_good_end(box,-1))
            break;

        line = get_one_line(box);
        if(line==NULL)
        {   /* remove empty line before eof if separator.*/
            if(last_blank && box->separators)
            {   (*nr_lines)--;
                (*nr_chars)--;
                goto_position(box, last_position);
                last_blank = 0;
            }
            break;
        }

        /*
         * Check for separator
         */

        sep = box->separators;
        while(sep != NULL && !is_separator(sep, line))
            sep = sep->next;

        if(sep!=NULL)
        {   /* Separator found */
            box->keep_line = 1;  /* keep separator line to read later  */

            /* goto_position() clears keep_line again: after the seek,
             * the empty line and the separator are both read anew.
             */
            if(last_blank)       /* remove empty line before separator */
            {   (*nr_lines)--;
                (*nr_chars)--;
                goto_position(box, last_position);
                last_blank = 0;
            }
            break;
        }

        /*
         *   >>>>From becomes >>>From
         */

        if(box->strip_gt && line[0]=='>')
        {   char *reader = line;
            while(*reader == '>') reader++;
            if(strncmp(reader, "From ", 5)==0)
               line++;           /* drop one leading '>' */
        }

        /*
         * Count
         */

        (*nr_lines)++;
        length        = strlen(line);
        *nr_chars    += length;
        last_blank    = length==1;   /* just the newline */
        last_position = box->line_start;
    }

/*fprintf(stderr, "Scanning done\n");*/
    return 1;
}

/*
 * take_scalar
 * Take a block of file-data into one scalar, as efficient as possible.
 * The bytes from offset BEGIN up to (not including) END are copied.
 *
 * Returns a new SV owned by the caller.  It is empty when the range
 * is empty or BEGIN cannot be seeked to, and shorter than requested
 * when the file ends (or a read fails) before END.  The file is left
 * positioned after the last byte read.
 */

static SV* take_scalar(Mailbox *box, long begin, long end)
{
    char     buffer[COPYSIZE];
    size_t   tocopy;
    SV      *result = newSVpv("", 0);

    /* An empty or reversed range must not wrap around in size_t. */
    if(end <= begin)
        return result;

    tocopy = (size_t)(end - begin);

    /* pre-grow the scalar, so Perl doesn't need to re-alloc; one
     * extra byte for the terminating NUL.
     */
    SvGROW(result, tocopy + 1);

    if(goto_position(box, begin) != 0)
        return result;

    while(tocopy > 0)
    {   size_t take  = tocopy < COPYSIZE ? tocopy : COPYSIZE;

        /* Element size 1 makes fread() report the number of BYTES
         * read; with the sizes swapped it reports 0 or 1 "items".
         */
        size_t bytes = fread(buffer, 1, take, box->file);
        if(bytes == 0)
            break;                /* end of file or read error */

        sv_catpvn(result, buffer, bytes);
        tocopy -= bytes;
    }

    return result;
}

/***
 *** HERE XS STARTS
 ***/

MODULE = Mail::Box::Parser::C PACKAGE = Mail::Box::Parser::C  PREFIX = MBPC_

PROTOTYPES: ENABLE

#
# open_filename
# Open NAME with fopen() mode MODE and register it as a mailbox.
# Returns the slot number that identifies the mailbox in the other
# calls, or an undefined value when the file cannot be opened.  TRACE
# is kept in the mailbox as the level used to filter parser warnings.
#

int
MBPC_open_filename(char *name, char *mode, int trace)

  PREINIT:
    Mailbox * box;
    int       boxnr;
    FILE    * file;

  CODE:

    /* Open the file. */
    file = fopen(name, mode);
    if(file==NULL)
    {   /*fprintf(stderr, "Unable to open file %s for %s.\n", name, mode);*/
        XSRETURN_UNDEF;
    }

    box        = new_mailbox(name);
    box->file  = file;
    box->trace = trace;     /* read_header_line() tests this level */

    boxnr      = take_box_slot(box);

    /*fprintf(stderr, "Open is done.\n");*/
    RETVAL     = boxnr;

  OUTPUT:
    RETVAL

#
# open_filehandle
# Register an already opened stream FH as a mailbox.  NAME is only
# stored with the mailbox.  Returns the slot number that identifies
# the mailbox in the other calls.  TRACE is kept in the mailbox as the
# level used to filter parser warnings.
#

int
MBPC_open_filehandle(FILE *fh, char *name, int trace)

  PREINIT:
    Mailbox * box;
    int       boxnr;

  CODE:
    box        = new_mailbox(name);
    box->file  = fh;
    box->trace = trace;     /* read_header_line() tests this level */

    boxnr      = take_box_slot(box);

    /*fprintf(stderr, "Open with filehande is done.\n");*/
    RETVAL     = boxnr;

  OUTPUT:
    RETVAL


#
# close_file
# Release the mailbox in slot BOXNR: the slot is freed, the stream is
# closed, and the separators, the filename and the record itself are
# released.  An unknown BOXNR is ignored.  Returns nothing.
#

void
MBPC_close_file(int boxnr)

  PREINIT:
    Mailbox   * box;
    Separator * sep;

  CODE:
    box  = get_box(boxnr);
    if(box==NULL)
        XSRETURN_EMPTY;    /* a bare return would leave the argument
                            * on the stack as if it were the result */

    free_box_slot(boxnr);

    /* NOTE(review): this also closes a stream which was handed in
     * via open_filehandle -- confirm that the Perl side expects so.
     */
    if(box->file != NULL)
    {   fclose(box->file);
        box->file = NULL;
    }

    sep = box->separators;
    while(sep!=NULL)
    {   Separator * next = sep->next;
        Safefree(sep->line);
        Safefree(sep);
        sep = next;
    }

    Safefree(box->filename);
    Safefree(box);


#
# push_separator
# Make LINE_START the active separator of mailbox BOXNR; the text is
# copied.  Separators which were pushed before stay known and become
# active again after pop_separator.  An unknown BOXNR is ignored.
# Returns nothing.
#

void
MBPC_push_separator(int boxnr, char *line_start)

  PREINIT:
    Mailbox *box;
    Separator  *sep;

  PPCODE:
    box  = get_box(boxnr);
    if(box==NULL)
        XSRETURN_EMPTY;    /* a bare return would leave the arguments
                            * on the stack as if they were results   */

    /*fprintf(stderr, "separator\n");*/
    New(0, sep, 1, Separator);
    sep->length     = strlen(line_start);

    /*fprintf(stderr, "separator %ld\n", (long)sep->length+1);*/
    New(0, sep->line, sep->length+1, char);
    strcpy(sep->line, line_start);

    sep->next       = box->separators;
    box->separators = sep;

    /* Count the active "From " separators: while there are any, the
     * body readers strip one '>' from ">From " lines.
     * NOTE(review): this compares only sep->length bytes, so a prefix
     * of "From " counts too.  pop_separator uses the same test, and
     * the two have to be changed together.
     */
    if(strncmp(sep->line, "From ", sep->length)==0)
        box->strip_gt++;


#
# pop_separator
# Remove the active (most recently pushed) separator of mailbox BOXNR
# and return its text.  An undefined value is returned when BOXNR is
# unknown or no separator is left.
#

SV *
MBPC_pop_separator(int boxnr)

  PREINIT:
    Mailbox   *box;
    Separator *top;

  CODE:
    box = get_box(boxnr);
    if(box==NULL || box->separators==NULL)
        XSRETURN_UNDEF;

    /* Unlink the top of the stack. */
    top             = box->separators;
    box->separators = top->next;

    /* Same test as push_separator uses to count "From " separators. */
    if(strncmp(top->line, "From ", top->length)==0)
        box->strip_gt--;

    RETVAL = newSVpv(top->line, top->length);

    Safefree(top->line);
    Safefree(top);

  OUTPUT:
    RETVAL


#
# get_position
# Return the file position of the next line to be processed in
# mailbox BOXNR, or 0 when BOXNR is unknown.
#

long
MBPC_get_position(int boxnr)

  PREINIT:
    Mailbox *box;

  CODE:
    box    = get_box(boxnr);
    RETVAL = box==NULL ? 0 : file_position(box);

  OUTPUT:
    RETVAL

#
# set_position
# Move mailbox BOXNR to file position WHERE.  Returns 1 when the seek
# succeeded, and 0 when it failed or BOXNR is unknown.
#

int
MBPC_set_position(int boxnr, long where)

  PREINIT:
    Mailbox *box;

  CODE:
    RETVAL = 0;
    box    = get_box(boxnr);

    if(box!=NULL && goto_position(box, where)==0)
        RETVAL = 1;

  OUTPUT:
    RETVAL

#
# read_header
# Returns (begin, end, list-of-fields)
# Where
#     begin and end represent file-locations before resp after the header
#     each field is a ref to an array with a name/content pair, representing
#          one line.
# Returns the empty list when BOXNR is unknown or has no open stream.
#

void
MBPC_read_header(int boxnr)

  PREINIT:
    Mailbox * box;
    SV      * name;
    SV      * content;
    SV      * end;

  PPCODE:
    box = get_box(boxnr);
    if(box==NULL || box->file==NULL)
        XSRETURN_EMPTY;    /* a bare return would leave the argument
                            * on the stack as if it were the result */

    XPUSHs(sv_2mortal(newSViv((IV)file_position(box))));

    /* The end position is only known after the last field; reserve
     * its place in the result now and fill it in below.
     */
    XPUSHs(end = sv_newmortal());

    while(read_header_line(box, &name, &content))
    {   AV * field = newAV();
        av_push(field, name);     /* av_push does not increase refcount */
        av_push(field, content);
        XPUSHs(sv_2mortal(newRV_noinc((SV *)field)));
    }

    /*fprintf(stderr, "Header has been read\n");*/
    sv_setiv(end, (IV)file_position(box));

#
# in_dosmode
# Return the dosmode flag of mailbox BOXNR.  The flag starts as true
# and is cleared once a line which ends in a bare LF has been read.
# An undefined value is returned when BOXNR is unknown.
#

int
MBPC_in_dosmode(int boxnr)

  PREINIT:
    Mailbox *box;

  CODE:
    if((box = get_box(boxnr)) == NULL)
        XSRETURN_UNDEF;

    RETVAL = box->dosmode;

  OUTPUT:
    RETVAL


#
# read_separator
# Return a line with the last defined separator.  Empty lines before this
# are permitted, but no other lines.
# On success (position-of-the-line, line) is returned.  The empty list
# is returned when BOXNR is unknown, no separator is defined, the end
# of the file is reached, or the first non-empty line does not start
# with the separator; in that last case the line is kept to be read
# again.
#

void
MBPC_read_separator(int boxnr)

  PREINIT:
    Mailbox   *box;
    Separator *sep;
    char      *line;

  PPCODE:
    box  = get_box(boxnr);
    if(box==NULL)
        XSRETURN_EMPTY;

    sep  = box->separators;    /* Never success when there is no sep */
    if(sep==NULL)
        XSRETURN_EMPTY;

    line = get_one_line(box);  /* Get first real line. */
    while(line!=NULL && line[0]=='\n' && line[1]==EOL)
        line = get_one_line(box);

    if(line==NULL)             /* EOF reached. */
        XSRETURN_EMPTY;

    if(strncmp(sep->line, line, sep->length)!=0)
    {   box->keep_line = 1;
        XSRETURN_EMPTY;        /* a bare return would leave the argument
                                * on the stack as if it were the result */
    }

    EXTEND(SP, 2);
    PUSHs(sv_2mortal(newSViv(box->line_start)));
    PUSHs(sv_2mortal(newSVpv(line, strlen(line))));


#
# body_as_string
# Read the whole body into one scalar, and return it.
# When lines need a post-processing, we read line-by-line.  Otherwise
# we can read the block as a whole.
# Returns (begin, end, body), or the empty list when BOXNR is unknown.
#

void
MBPC_body_as_string(int boxnr, int expect_chars, int expect_lines)

  PREINIT:
    Mailbox *box;
    SV      *result;
    char   **lines;
    int      nr_lines = 0;
    int      nr_chars = 0;
    int      line_nr;
    long     begin;

  PPCODE:
    box  = get_box(boxnr);
    if(box==NULL)
        XSRETURN_EMPTY;

    begin = file_position(box);

    /* Fast path: nothing has to be stripped and the size is known. */
    if(!box->dosmode && !box->strip_gt && expect_chars >=0)
    {
        long  end = begin + expect_chars;

        if(is_good_end(box, end))
        {   /* is_good_end() has put the file back on `begin', so
             * asking file_position() here would report the begin of
             * the body as its end.  The end is known: report it.
             */
            EXTEND(SP, 3);
            PUSHs(sv_2mortal(newSViv(begin)));
            PUSHs(sv_2mortal(newSViv(end)));
            PUSHs(sv_2mortal(take_scalar(box, begin, end)));
            XSRETURN(3);
        }
    }

    lines = read_stripped_lines(box, expect_chars, expect_lines,
        &nr_chars, &nr_lines);

    if(lines==NULL)
        XSRETURN_EMPTY;

    /* Join the strings. */
    result = newSVpv("",0);
    SvGROW(result, (unsigned int)nr_chars);

    for(line_nr=0; line_nr<nr_lines; line_nr++)
    {   sv_catpv(result, lines[line_nr]);
        Safefree(lines[line_nr]);
    }

    /* The end is reported after the empty lines behind the body. */
    skip_empty_lines(box);
    Safefree(lines);

    EXTEND(SP, 3);
    PUSHs(sv_2mortal(newSViv(begin)));
    PUSHs(sv_2mortal(newSViv(file_position(box))));
    PUSHs(sv_2mortal(result));


#
# body_as_list
# Read the whole body into a list of scalars.
# Returns (begin, end, ref-to-array-of-lines), or the empty list when
# BOXNR is unknown.
#

void
MBPC_body_as_list(int boxnr, int expect_chars, int expect_lines)

  PREINIT:
    Mailbox *box;
    char   **lines;
    int      nr_lines = 0;
    int      nr_chars = 0;
    int      line_nr;
    long     begin;
    AV     * results;

  PPCODE:
    box   = get_box(boxnr);
    if(box==NULL)
        XSRETURN_EMPTY;

    begin = file_position(box);
    lines = read_stripped_lines(box, expect_chars, expect_lines,
        &nr_chars, &nr_lines);

    if(lines==NULL)
        XSRETURN_EMPTY;    /* a bare return would leave the arguments
                            * on the stack as if they were results   */

    /* The end is reported before the empty lines behind the body
     * are skipped.
     */
    XPUSHs(sv_2mortal(newSViv(begin)));
    XPUSHs(sv_2mortal(newSViv(file_position(box))));

    /* Allocating the lines for real. */

    results = (AV *)sv_2mortal((SV *)newAV());
    av_extend(results, nr_lines);

    for(line_nr=0; line_nr<nr_lines; line_nr++)
    {   char *line = lines[line_nr];
        av_push(results, newSVpv(line, 0));
        Safefree(line);
    }

    /* newRV() takes its own reference on the array, which survives
     * the release of the mortal one taken above.
     */
    XPUSHs(sv_2mortal(newRV((SV *)results)));

    skip_empty_lines(box);
    Safefree(lines);


#
# body_as_file
# Read the whole body and write its lines to the stream OUT.
# Returns (begin, end, number-of-lines), or the empty list when BOXNR
# is unknown.
#

void
MBPC_body_as_file(int boxnr, FILE *out, int expect_chars, int expect_lines)

  PREINIT:
    Mailbox *box;
    char   **lines;
    int      nr_lines = 0;
    int      nr_chars = 0;
    int      written  = 0;
    long     begin;

  PPCODE:
    box = get_box(boxnr);
    if(box==NULL)
        XSRETURN_EMPTY;

    begin = file_position(box);
    lines = read_stripped_lines(box, expect_chars, expect_lines,
        &nr_chars, &nr_lines);

    if(lines==NULL)
        XSRETURN_EMPTY;

    /* The end is taken before the empty lines behind the body
     * are skipped.
     */
    EXTEND(SP, 3);
    PUSHs(sv_2mortal(newSViv((IV)begin)));
    PUSHs(sv_2mortal(newSViv((IV)file_position(box))));
    PUSHs(sv_2mortal(newSViv((IV)nr_lines)));

    /* Write each line, and release it right after. */
    while(written < nr_lines)
    {   char *line = lines[written++];
        fputs(line, out);
        Safefree(line);
    }

    skip_empty_lines(box);
    Safefree(lines);


#
# body_delayed
# Skip the whole body, only counting chars and lines.
# Returns (begin, end, number-of-chars, number-of-lines), or the empty
# list when BOXNR is unknown.  When the predicted size EXPECT_CHARS
# turns out to be a good end, the predictions themselves are returned
# as the counts and nothing is scanned.
#

void
MBPC_body_delayed(int boxnr, int expect_chars, int expect_lines)

  PREINIT:
    Mailbox *box;
    int      nr_lines = 0;
    int      nr_chars = 0;
    long     begin;
    long     end;

  PPCODE:
    box = get_box(boxnr);
    if(box==NULL)
        XSRETURN_EMPTY;

    begin = file_position(box);
    end   = begin + expect_chars;

    if(expect_chars >= 0 && is_good_end(box, end))
    {   /* Accept the predicted end: jump over the body. */
        goto_position(box, end);

        EXTEND(SP, 4);
        PUSHs(sv_2mortal(newSViv((IV)begin)));
        PUSHs(sv_2mortal(newSViv((IV)end)));
        PUSHs(sv_2mortal(newSViv((IV)expect_chars)));
        PUSHs(sv_2mortal(newSViv((IV)expect_lines)));
        skip_empty_lines(box);
        XSRETURN(4);
    }

    /* No usable prediction: count line by line. */
    if(!scan_stripped_lines(box, expect_chars, expect_lines,
        &nr_chars, &nr_lines))
        XSRETURN_EMPTY;

    EXTEND(SP, 4);
    PUSHs(sv_2mortal(newSViv((IV)begin)));
    PUSHs(sv_2mortal(newSViv((IV)file_position(box))));
    PUSHs(sv_2mortal(newSViv((IV)nr_chars)));
    PUSHs(sv_2mortal(newSViv((IV)nr_lines)));
    skip_empty_lines(box);

#
# get_filehandle
# Return the stream of mailbox BOXNR, or an undefined value when BOXNR
# is unknown.
#

FILE *
MBPC_get_filehandle(int boxnr)

  PREINIT:
    Mailbox * box;

  CODE:
    if((box = get_box(boxnr)) == NULL)
        XSRETURN_UNDEF;

    RETVAL = box->file;

  OUTPUT:
    RETVAL