The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/* Copyright 2009 Peter Karman
 *
 * This program is free software; you can redistribute it and/or modify
 * under the same terms as Perl itself.
 */

/*
 * Standard XS greeting.
 */
#ifdef __cplusplus
extern "C" {
#endif
#define PERL_NO_GET_CONTEXT 
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "ppport.h"
#ifdef __cplusplus
}
#endif

#ifdef EXTERN
  #undef EXTERN
#endif

#define EXTERN static

/* pure C helpers */
#include "search-tools.c"

/********************************************************************/

MODULE = Search::Tools       PACKAGE = Search::Tools

PROTOTYPES: enable


void
describe(thing)
    SV *thing
    
    CODE:
        st_describe_object(thing);
        st_dump_sv(thing);
        

######################################################################
MODULE = Search::Tools       PACKAGE = Search::Tools::UTF8

PROTOTYPES: enable

int
byte_length(string)
    SV* string;
    
    PREINIT:
        STRLEN len;
        U8 * bytes;
        
    CODE:
        bytes  = (U8*)SvPV(string, len);
        RETVAL = len;
    
    OUTPUT:
        RETVAL


int
is_perl_utf8_string(string)
    SV* string;
    
    PREINIT:
        STRLEN len;
        U8 * bytes;
        
    CODE:
        bytes  = (U8*)SvPV(string, len);
        RETVAL = is_utf8_string(bytes, len);
        
    OUTPUT:
        RETVAL
        
                

SV*
find_bad_utf8(string)
    SV* string;
            
    CODE:
        RETVAL = st_find_bad_utf8(string);

    OUTPUT:
        RETVAL

     
# benchmarks show these XS versions are 9x faster
# than their native Perl regex counterparts
boolean 
is_ascii(string)
    SV* string;
            
    CODE:
        RETVAL = st_is_ascii(string);

    OUTPUT:
        RETVAL


boolean
is_latin1(string)
    SV* string;

    PREINIT:
        STRLEN         len;
        unsigned char* bytes;
        unsigned int   i;

    CODE:
        bytes  = (unsigned char*)SvPV(string, len);
        RETVAL = 1;
        for(i=0; i < len; i++) {
            if (bytes[i] > 0x7f && bytes[i] < 0xa0) {
                RETVAL = 0;
                break;
            }
        }

    OUTPUT:
        RETVAL


void
debug_bytes(string)
    SV* string;

    PREINIT:
        STRLEN         len;
        unsigned char* bytes;
        unsigned int   i;

    CODE:
        bytes  = (unsigned char*)SvPV(string, len);
        for(i=0; i < len; i++) {
            warn("'%c' \\x%x \\%d\n", bytes[i], bytes[i], bytes[i]);
        }


IV
find_bad_ascii(string)
    SV* string;
    
    PREINIT:
        STRLEN          len;
        unsigned char*  bytes;
        int             i;
        
    CODE:
        bytes  = (unsigned char*)SvPV(string, len);
        RETVAL = -1;
        for(i=0; i < len; i++) {
            if (bytes[i] >= 0x80) {
                RETVAL = i;
                break;
            }  
        }

    OUTPUT:
        RETVAL

int
find_bad_latin1(string)
    SV* string;

    PREINIT:
        STRLEN          len;
        unsigned char*  bytes;
        int             i;

    CODE:
        bytes  = (unsigned char*)SvPV(string, len);
        RETVAL = -1;
        for(i=0; i < len; i++) {
            if (bytes[i] > 0x7f && bytes[i] < 0xa0) {
                RETVAL = i;
                break;
            }
        }

    OUTPUT:
        RETVAL



#############################################################################

MODULE = Search::Tools       PACKAGE = Search::Tools::Tokenizer

PROTOTYPES: enable

SV*
tokenize(self, str, ...)
    SV* self;
    SV* str;
    
    PREINIT:
        SV* token_re;
        SV* token_list_sv;
        STRLEN len;
        U8* bytes;
        SV* heat_seeker = NULL;
        IV match_num;
        
    CODE:
        if (items > 2) {
            heat_seeker = ST(2);
        }
        match_num = 0;
        if (items > 3) {
            match_num = SvIV(ST(3));
        }
        
        /* test if utf8 flag on and make sure it is.
         * otherwise, regex for \w can fail for multibyte chars.
         * we do a slight (~7%) optimization for ascii str because
         * the regex engine is faster for all-ascii texts.
         * the logic is: 
         *  if the flag is on, ok.
         *  else, 
         *      if the string is ascii, ok for flag to be off,
         *      but we don't turn it off. 
         *      if the string is NOT ascii, make sure it is utf8
         *      and turn the flag on. 
         */
        if (!SvUTF8(str)) {
            if (!st_is_ascii(str)) {
                bytes  = (U8*)SvPV(str, len);
                if(!is_utf8_string(bytes, len)) {
                    croak(ST_BAD_UTF8);
                }
                SvUTF8_on(str);
            }
        }

        token_re = st_hvref_fetch(self, "re");
        token_list_sv = st_tokenize(str, token_re, heat_seeker, match_num);
        RETVAL = token_list_sv;
    
    OUTPUT:
        RETVAL

SV*
set_debug(self, val)
    SV* self;
    boolean val;
    
    CODE:
        SV* st_debug_var;
        st_debug_var = get_sv("Search::Tools::XS_DEBUG", GV_ADD);
        //warn(" st_debug_var before = '%s'\n", SvPV_nolen(st_debug_var));
        SvIV_set(st_debug_var, val);
        SvIOK_on(st_debug_var);
        //warn("ST_DEBUG set to %d", val);
        //warn(" st_debug_var set = '%d'\n", ST_DEBUG);
        if (SvREFCNT(st_debug_var) == 1) {
            // IMPORTANT because we access var from Perl and C
            SvREFCNT_inc(st_debug_var);
        }
        RETVAL = st_debug_var;
    
    OUTPUT:
        RETVAL


SV*
get_offsets(self, str, regex)
    SV* self;
    SV* str;
    SV* regex;
    
    CODE:
        RETVAL = newRV_noinc((SV*)st_heat_seeker_offsets(str, regex));
    
    OUTPUT:
        RETVAL



############################################################################

MODULE = Search::Tools       PACKAGE = Search::Tools::TokenList

PROTOTYPES: enable

void
dump(self)
    st_token_list *self;
    
    CODE:
        st_dump_token_list(self);


SV*
next(self)
    st_token_list *self;
   
    PREINIT:
        IV len;
        
    CODE:
        len = av_len(self->tokens);
        //warn("len = %d and pos = %d", len, self->pos);
        
        if (len == -1) {
            // empty list
            RETVAL = &PL_sv_undef;
        }
        else if (self->pos > len) {
            // exceeded end of list
            RETVAL = &PL_sv_undef;
        }
        else {
            if (!av_exists(self->tokens, self->pos)) {
                ST_CROAK("no such index at %d", self->pos);
            }
            //st_dump_sv( st_av_fetch(self->tokens, self->pos) );
            RETVAL = SvREFCNT_inc(st_av_fetch(self->tokens, self->pos++));
            
        }
        
            
    OUTPUT:
        RETVAL


SV*
prev(self)
    st_token_list *self;
   
    PREINIT:
        IV len;
        
    CODE:
        len = av_len(self->tokens);
        if (len == -1) {
            // empty list
            RETVAL = &PL_sv_undef;
        }
        else if (self->pos < 0) {
            // exceeded start of list
            RETVAL = &PL_sv_undef;
        }
        else {
            if (!av_exists(self->tokens, (self->pos-1))) {
                ST_CROAK("no such index at %d", (self->pos-1));
            }
            RETVAL = SvREFCNT_inc(st_av_fetch(self->tokens, --(self->pos)));
        }
        
            
    OUTPUT:
        RETVAL


SV*
get_token(self, pos)
    st_token_list *self;
    IV pos;
    
    CODE:
        if (!av_exists(self->tokens, pos)) {
            RETVAL = &PL_sv_undef;
        }
        else {
            RETVAL = SvREFCNT_inc(st_av_fetch(self->tokens, pos));
        }
    
    OUTPUT:
        RETVAL


IV
set_pos(self, new_pos)
    st_token_list *self;
    IV  new_pos;
            
    CODE:
        RETVAL = self->pos;
        self->pos = new_pos;
       
    OUTPUT:
        RETVAL


IV
reset(self)
    st_token_list *self;
        
    CODE:
        RETVAL = self->pos;
        self->pos = 0;
    
    OUTPUT:
        RETVAL
 

IV
len(self)
    st_token_list *self;
    
    CODE:
        RETVAL = av_len(self->tokens) + 1;
        
    OUTPUT:
        RETVAL


IV
num(self)
    st_token_list *self;
    
    CODE:
        RETVAL = self->num;
    
    OUTPUT:
        RETVAL


IV
pos(self)
    st_token_list *self;
    
    CODE:
        RETVAL = self->pos;
    
    OUTPUT:
        RETVAL


SV*
as_array(self)
    st_token_list *self;
    
    CODE:
        RETVAL = newRV_inc((SV*)self->tokens);
    
    OUTPUT:
        RETVAL
        

SV*
get_heat(self)
    st_token_list *self;
    
    PREINIT:
        AV *heat;
        IV len;
        IV pos;
        SV* h;
    
    CODE:
        heat = newAV();
        pos = 0;
        len = av_len(self->heat)+1;
        while (pos < len) {
            h = st_av_fetch(self->heat, pos++);
            av_push(heat, h);
        }
        RETVAL = newRV((SV*)heat);    /* no _inc -- this is a copy */
    
    OUTPUT:
        RETVAL


SV*
get_sentence_starts(self)
    st_token_list *self;
    
    PREINIT:
        AV *starts;
        IV len;
        IV pos;
        SV* sstart;
    
    CODE:
        starts = newAV();
        pos = 0;
        len = av_len(self->sentence_starts)+1;
        while (pos < len) {
            sstart = st_av_fetch(self->sentence_starts, pos++);
            av_push(starts, sstart);
        }
        RETVAL = newRV((SV*)starts);    /* no _inc -- this is a copy */
    
    OUTPUT:
        RETVAL


SV*
matches(self)
    st_token_list *self;
    
    PREINIT:
        AV *matches;
        IV pos;
        IV len;
        SV* tok;
        st_token *token;
    
    CODE:
        matches = newAV();
        pos = 0;
        len = av_len(self->tokens)+1;
        while (pos < len) {
            tok = st_av_fetch(self->tokens, pos++);
            token = (st_token*)st_extract_ptr(tok);
            if (token->is_match) {
                av_push(matches, tok);
            }
        }
        RETVAL = newRV((SV*)matches); /* no _inc -- this is only copy */
    
    OUTPUT:
        RETVAL


IV
num_matches(self)
    st_token_list *self;
    
    PREINIT:
        IV pos;
        IV len;
        IV num_matches;
        st_token *token;
    
    CODE:
        num_matches = 0;
        pos = 0;
        len = av_len(self->tokens)+1;
        while (pos < len) {
            token = (st_token*)st_av_fetch_ptr(self->tokens, pos++);
            if (token->is_match) {
                num_matches++;
            }
        }
        RETVAL = num_matches;
    
    OUTPUT:
        RETVAL


void
DESTROY(self)
    SV *self;
    
    PREINIT:
        st_token_list *tl;
        
    CODE:
        
        
        tl = (st_token_list*)st_extract_ptr(self);
        tl->ref_cnt--;
        if (ST_DEBUG) {
            warn("............................");
            warn("DESTROY %s [%ld] [0x%lx]\n", 
                SvPV_nolen(self), (unsigned long)tl->ref_cnt, (unsigned long)tl);
            st_describe_object(self);
            st_dump_sv((SV*)tl->tokens);
        }
        if (tl->ref_cnt < 1) {
            st_free_token_list(tl);
        }



############################################################################

MODULE = Search::Tools       PACKAGE = Search::Tools::Token

PROTOTYPES: enable

IV
pos(self)
    st_token *self;
    
    CODE:
        RETVAL = self->pos;
    
    OUTPUT:
        RETVAL


SV*
str(self)
    st_token *self;
            
    CODE:
        RETVAL = SvREFCNT_inc(self->str);

    OUTPUT:
        RETVAL


IV
len(self)
    st_token *self;
    
    CODE:
        RETVAL = self->len;
    
    OUTPUT:
        RETVAL


IV
u8len(self)
    st_token *self;
    
    CODE:
        RETVAL = self->u8len;
    
    OUTPUT:
        RETVAL


IV
is_hot(self)
    st_token *self;
    
    CODE:
        RETVAL = self->is_hot;
    
    OUTPUT:
        RETVAL


IV
is_match(self)
    st_token *self;
    
    CODE:
        RETVAL = self->is_match;
    
    OUTPUT:
        RETVAL


IV
is_sentence_start(self)
    st_token *self;
    
    CODE:
        RETVAL = self->is_sentence_start;
    
    OUTPUT:
        RETVAL


IV
is_sentence_end(self)
    st_token *self;
    
    CODE:
        RETVAL = self->is_sentence_end;
    
    OUTPUT:
        RETVAL

IV
is_abbreviation(self)
    st_token *self;
    
    CODE:
        RETVAL = self->is_abbreviation;
    
    OUTPUT:
        RETVAL
        
IV
set_match(self, val)
    st_token *self;
    IV val;
    
    CODE:
        RETVAL = self->is_match;
        self->is_match = val;
    
    OUTPUT:
        RETVAL


IV
set_hot(self, val)
    st_token *self;
    IV val;
    
    CODE:
        RETVAL = self->is_hot;
        self->is_hot = val;
    
    OUTPUT:
        RETVAL


void
dump(self)
    st_token *self;
    
    CODE:
        st_dump_token(self);


void
DESTROY(self)
    SV *self;
    
    PREINIT:
        st_token *tok;
        
    CODE:
        tok = (st_token*)st_extract_ptr(self);
        tok->ref_cnt--;
        if (ST_DEBUG) {
            warn("............................");
            warn("DESTROY %s [%ld] [0x%lx]\n", 
                SvPV_nolen(self), (unsigned long)tok->ref_cnt, (unsigned long)tok);
        }
        if (tok->ref_cnt < 1) {
            st_free_token(tok);
        }
    

############################################################################

MODULE = Search::Tools       PACKAGE = Search::Tools::XML

PROTOTYPES: enable

SV*
_escape_xml(text, is_flagged_utf8)
    char *text;
    int   is_flagged_utf8;

    CODE:
        RETVAL = st_escape_xml(text);
        if (is_flagged_utf8) {
            SvUTF8_on(RETVAL);
        }
    
    OUTPUT:
        RETVAL