The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "perlio.h"
#include "perliol.h"

#include "unicodeeol.h"

IV
PerlIO_UnicodeEOL_pushed(pTHX_ PerlIO *f, const char *mode, SV *arg, PerlIO_funcs *tab)
{
    UnicodeEOL *self = PerlIOSelf(f, UnicodeEOL);
    STDCHAR *p;
    STRLEN len;
    
    if (PerlIOBase(PerlIONext(f))->flags & PERLIO_F_UTF8)
        PerlIOBase(f)->flags |= PERLIO_F_UTF8;
    else
        PerlIOBase(f)->flags &= ~PERLIO_F_UTF8;

    self->previous = 0;

    if (arg && SvPOK(arg) && SvLEN(arg))
        Perl_die(aTHX_ "unicodeeol does not take any arguments");

    return PerlIOBuf_pushed(aTHX_ f, mode, arg, tab);
}

static inline void unshift(PerlIOBuf * buf, const SSize_t count)
{
    STDCHAR *newbuf;
    const SSize_t size = buf->end - buf->ptr;
    Newx(newbuf, size + count, STDCHAR);
    Copy(buf->ptr, newbuf + count, size, STDCHAR);
    Safefree(buf->buf);
    buf->ptr = buf->buf = newbuf;
    buf->bufsiz = size + 1;
    buf->end = buf->buf + size + 1;
}

IV
PerlIO_UnicodeEOL_fill(pTHX_ PerlIO * f)
{
    IV code = PerlIOBuf_fill(aTHX_ f);
    UnicodeEOL *self = PerlIOSelf(f, UnicodeEOL);
    PerlIOBuf *b = PerlIOSelf(f, PerlIOBuf);
    bool last_was_cr = FALSE;
    bool UTF = (PerlIOBase(f)->flags & PERLIO_F_UTF8 ? TRUE : FALSE); /* cBOOL */
    unsigned char *out = b->ptr, *in = b->ptr, *end = b->end;

    if (code) return code;

    /* If the last fill ended on something guaranteed incomplete,
       like the middle of a UTF-8 codepoint, or possibly incomplete,
       specifically a \r *may* be followed by \n, so \r at the end
       of a buffer will immediatly become \n, and if the next buffer
       starts with \n, it should silently be ignored */
    switch (self->previous) {
        case '\r':
            if (*in == '\n')
                out = in = ++b->ptr;
            break;
        case 0xc2:
            if (*in == 0x85) {
                *in = '\n';
            } else {
                const SSize_t size = b->end - b->ptr;
                unshift(b, 1);
                end = b->end;
                out = b->ptr;
                *out++ = 0xc2;
                in = out;
            }
            break;
        case 0x80:
            if (*in == 0xa8 || *in == 0xa9) {
                *in = '\n';
            } else {
                unshift(b, 2);
                end = b->end;
                out = b->ptr;
                *out++ = 0xc2;
                *out++ = 0x80;
                in = out;
            }
            break;
        case 0xe2:
            if (*in == 0x80 && (in+1<end) && (*(in+1) == 0xa8 || *(in+1) == 0xa9)) {
                out = in = b->ptr += 1;
                *out = '\n';
            } else {
                unshift(b, 1);
                end = b->end;
                out = b->ptr;
                *out++ = 0xe2;
                in = out;
            }
            break;
    }
    self->previous = 0;

    while (in != end) {
        bool set_newline = FALSE;
        switch (*in) {
            case 0x0a: 
                if (last_was_cr) out--;
            case 0x0b:
            case 0x0c:
            case 0x0d:
                set_newline = TRUE;
                break;
            case 0xc2: /* 0xc2 0x85 is \u{85}, one of the \R codepoints */
                if (UTF) {
                    if (in + 1 == end) {
                        self->previous = 0xc2;
                        out--; /* Pretend we didn't see this byte */
                    } else {
                        if (*(in + 1) == 0x85) {
                            set_newline = TRUE;
                            in++;
                        } else {
                            *out++ = *in++;
                            *out = *in;
                        }
                    }
                } else {
                    *out = *in;
                }
                break;
            case 0xe2: /* 0xe2 0x80 0xa8 AND 0xe2 0x80 0xa9, \u{2028} and \u{2029} */
                if (UTF) {
                    if (in + 1 == end) {
                        self->previous = 0xe2;
                        b->end = in;
                        return 0;
                    } else if (*(in + 1) == 0x80) {
                        if (in + 2 == end) {
                            self->previous = 0x80;
                            b->end = in;
                            return 0;
                        } else {
                            if (*(in + 2) == 0xa8 || *(in + 2) == 0xa9) {
                                set_newline = TRUE;
                                in += 2;
                            } else {
                                *out++ = *in++;
                                *out++ = *in++;
                                *out = *in;
                            }
                        }
                    } else {
                        *out++ = *in++;
                        *out = *in;
                    }
                } else { /* Not UTF-8 */
                    *out = *in;
                }
            default:
                *out = *in;
                break;
        }
        if (*in == 0x0d)
            last_was_cr = TRUE;
        else
            last_was_cr = FALSE;
        if (set_newline)
            *out = '\n';
        out++, in++;
    }

    if (out != end)
        b->end = out;

    if (last_was_cr)
        self->previous = '\r';

    return 0;
}

PerlIO_funcs PerlIO_unicodeeol = {
    sizeof(PerlIO_funcs),
    "unicodeeol",
    sizeof(UnicodeEOL),
    PERLIO_K_BUFFERED | PERLIO_K_UTF8, 
    PerlIO_UnicodeEOL_pushed,
    PerlIOBuf_popped,
    PerlIOBuf_open,
    PerlIOBase_binmode,
    NULL,
    PerlIOBase_fileno,
    PerlIOBuf_dup,
    PerlIOBuf_read,
    PerlIOBuf_unread,
    PerlIOBuf_write,
    PerlIOBuf_seek,
    PerlIOBuf_tell,
    PerlIOBuf_close,
    PerlIOBuf_flush,
    PerlIO_UnicodeEOL_fill,
    PerlIOBase_eof,
    PerlIOBase_error,
    PerlIOBase_clearerr,
    PerlIOBase_setlinebuf,
    PerlIOBuf_get_base,
    PerlIOBuf_bufsiz,
    PerlIOBuf_get_ptr,
    PerlIOBuf_get_cnt,
    PerlIOBuf_set_ptrcnt
};

/* vim:set ts=8 sts=4 et: */

MODULE = PerlIO::unicodeeol            PACKAGE = PerlIO::unicodeeol

BOOT:  
#ifdef PERLIO_LAYERS
    PerlIO_define_layer(aTHX_ &PerlIO_unicodeeol);
#else
    Perl_die(aTHX_ "PerlIO layers are not supported");
#endif