The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

NAME

SimpleR::Reshape 数据处理转换

接口山寨自R语言的read.table/write.table/merge,还有reshape2/dplyr

FUNCTION

实例参考xt子文件夹

read_table

支持从文件或arrayref按行读入数据,转换后输出到新的文件或arrayref

    my $r = read_table(
        'reshape_src.csv', 
        skip_head=>1, 
        conv_sub => sub { [ "$_[0][0] $_[0][1]", $_[0][2], $_[0][3] ] }, 

        write_file => '01.read_table.csv', 
        #skip_sub => sub { $_[0][3]<200 }, 
        #return_arrayref => 1, 
        #write_head => [ "head_a", "key" , "value" ], 
        #sep=>',', 
        #charset=>'utf8', 
    );

write_table

将指定数据写入文本文件

    my $d = [ [qw/a b 1/], [qw/c d 2/] ]; 
    write_table($d, 
        file=> 'write_table.csv', 
        head => [ 'ka', 'kb', 'cnt'], 
        #sep => ',', 
        #charset => 'utf8', 
    );

melt

数据调整,参考R语言的reshape2包

    #id => [1, 2, 3], 'somekey', sub { ... }
    #measure => [4,5,6, ...]

    my $r = melt('reshape_src.csv',
        skip_head => 1, 

        names => [ qw/day hour state cnt rank/ ], 

        #skip_sub => sub { $_[0][3]<1000 }, 
        id => [ 0, 1, 2 ],
        measure => [3, 4], 
        melt_file => '02.melt.csv',
        #return_arrayref => 0, 
    );

cast

数据重组,参考R语言的reshape2包

注意:

reduce_sub 是在读取数据的过程中处理value,默认是直接push到value列表里

stat_sub 是在数据读取完毕后,对value列表进行最终统计处理

id => 与melt同

measure => 只返回1个值,取值类似1,2,3,4

value => 只返回1个值,取值类似1,2,3,4

    my $r = cast('02.melt.csv', 
        cast_file => '03.cast.csv', 
        #return_arrayref => 0, 
        #write_head => 0, 

        #key 有 cnt / rank 两种
        names => [ qw/day hour state key value/ ], 
        id => [ 0, 1, 2 ],
        measure => 3, 
        value => 4, 

        result_names => [ qw/day hour state cnt rank/ ], 

        #default_cell_value => 0,

        stat_sub => sub { my ($vlist) = @_; my @temp = sort { $b <=> $a } @$vlist; return $temp[0] }, 

        #reduce_sub => sub { 
        #   my ($r) = @_;
        #   my $s=0 ; $s+= $_ for @$r; 
        #   return [ $s ];
        #   }, 
    );

merge

合并两个dataframe,在perl中是二层数组

    my $r = merge( 
        [ [qw/a b 1/], [qw/c d 2/] ], 
        [ [qw/a b 3/], [qw/c d 4/] ], 
        by => [ 0, 1], 
        value => [2], 
    );
    # $r = [["a", "b", 1, 3], ["c", "d", 2, 4]]

split_file

把一个大文件按指定id或行数拆分成多个小文件

    my $src_file = '06.split_file.log';

    split_file($src_file, id => [ 0 ] ,
        # sep => ',', 
        # split_file => '06.test.log', 
    );

    split_file($src_file, line_cnt => 400);

arrange

sort rows by some method

按指定方法,将所有数据按行重新排序

    my $r = arrange('reshape_src.csv', 
        skip_head => 1, 
        sep=> ',', 
        charset => 'utf8', 

        arrange_sub => sub { 
            $a->[4] <=> $b->[4] or
            $a->[3] <=> $b->[3] 
        }, 
        arrange_file => '07.arrange.csv', 
        return_arrayref => 1, 
        write_head => [ qw/day hour state cnt rank/ ], 
    );