@@ -1,3 +1,18 @@
+Revision history for Bio-ASN1-EntrezGene
+
+1.70 2013-09-14 14:39:54 America/Chicago
+ * Bio::ASN1::EntrezGene is now able to parse EntrezGene-set input. In that
+ case, next_seq() returns the next set of sequences as an array ref with
+ one element per sequence, instead of an array ref with a single element.
+
+version 1.10: Important update if you see a segmentation fault when running the
+ parser - so far I have only seen it happen on Perl 5.8 (Perl 5.10
+ is fine), triggered by an exceedingly long (and invalid) URL in
+ one Arabidopsis entry. The Perl regex engine core dumps because
+ matching the long string exhausts the stack. I changed the
+ particular regex in EntrezGene.pm and Sequence.pm to solve the
+ issue. The overall parsing runs 2-3% faster after the change.
+
version 1.09: Added parser/indexer for NCBI's ASN.1-formatted
sequence files (like Genbank records).
Updated test, example scripts and documentation
@@ -0,0 +1,379 @@
+This software is copyright (c) 2013 by Mingyi Liu, GPC Biotech AG and Altana Research Institute.
+
+This is free software; you can redistribute it and/or modify it under
+the same terms as the Perl 5 programming language system itself.
+
+Terms of the Perl programming language system itself
+
+a) the GNU General Public License as published by the Free
+ Software Foundation; either version 1, or (at your option) any
+ later version, or
+b) the "Artistic License"
+
+--- The GNU General Public License, Version 1, February 1989 ---
+
+This software is Copyright (c) 2013 by Mingyi Liu, GPC Biotech AG and Altana Research Institute.
+
+This is free software, licensed under:
+
+ The GNU General Public License, Version 1, February 1989
+
+ GNU GENERAL PUBLIC LICENSE
+ Version 1, February 1989
+
+ Copyright (C) 1989 Free Software Foundation, Inc.
+ 51 Franklin St, Suite 500, Boston, MA 02110-1335 USA
+
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The license agreements of most software companies try to keep users
+at the mercy of those companies. By contrast, our General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. The
+General Public License applies to the Free Software Foundation's
+software and to any other program whose authors commit to using it.
+You can use it for your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Specifically, the General Public License is designed to make
+sure that you have the freedom to give away or sell copies of free
+software, that you receive source code or can get it if you want it,
+that you can change the software or use pieces of it in new free
+programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of a such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must tell them their rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License Agreement applies to any program or other work which
+contains a notice placed by the copyright holder saying it may be
+distributed under the terms of this General Public License. The
+"Program", below, refers to any such program or work, and a "work based
+on the Program" means either the Program or any work containing the
+Program or a portion of it, either verbatim or with modifications. Each
+licensee is addressed as "you".
+
+ 1. You may copy and distribute verbatim copies of the Program's source
+code as you receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice and
+disclaimer of warranty; keep intact all the notices that refer to this
+General Public License and to the absence of any warranty; and give any
+other recipients of the Program a copy of this General Public License
+along with the Program. You may charge a fee for the physical act of
+transferring a copy.
+
+ 2. You may modify your copy or copies of the Program or any portion of
+it, and copy and distribute such modifications under the terms of Paragraph
+1 above, provided that you also do the following:
+
+ a) cause the modified files to carry prominent notices stating that
+ you changed the files and the date of any change; and
+
+ b) cause the whole of any work that you distribute or publish, that
+ in whole or in part contains the Program or any part thereof, either
+ with or without modifications, to be licensed at no charge to all
+ third parties under the terms of this General Public License (except
+ that you may choose to grant warranty protection to some or all
+ third parties, at your option).
+
+ c) If the modified program normally reads commands interactively when
+ run, you must cause it, when started running for such interactive use
+ in the simplest and most usual way, to print or display an
+ announcement including an appropriate copyright notice and a notice
+ that there is no warranty (or else, saying that you provide a
+ warranty) and that users may redistribute the program under these
+ conditions, and telling the user how to view a copy of this General
+ Public License.
+
+ d) You may charge a fee for the physical act of transferring a
+ copy, and you may at your option offer warranty protection in
+ exchange for a fee.
+
+Mere aggregation of another independent work with the Program (or its
+derivative) on a volume of a storage or distribution medium does not bring
+the other work under the scope of these terms.
+
+ 3. You may copy and distribute the Program (or a portion or derivative of
+it, under Paragraph 2) in object code or executable form under the terms of
+Paragraphs 1 and 2 above provided that you also do one of the following:
+
+ a) accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of
+ Paragraphs 1 and 2 above; or,
+
+ b) accompany it with a written offer, valid for at least three
+ years, to give any third party free (except for a nominal charge
+ for the cost of distribution) a complete machine-readable copy of the
+ corresponding source code, to be distributed under the terms of
+ Paragraphs 1 and 2 above; or,
+
+ c) accompany it with the information you received as to where the
+ corresponding source code may be obtained. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form alone.)
+
+Source code for a work means the preferred form of the work for making
+modifications to it. For an executable file, complete source code means
+all the source code for all modules it contains; but, as a special
+exception, it need not include source code for modules which are standard
+libraries that accompany the operating system on which the executable
+file runs, or for standard header files or definitions files that
+accompany that operating system.
+
+ 4. You may not copy, modify, sublicense, distribute or transfer the
+Program except as expressly provided under this General Public License.
+Any attempt otherwise to copy, modify, sublicense, distribute or transfer
+the Program is void, and will automatically terminate your rights to use
+the Program under this License. However, parties who have received
+copies, or rights to use copies, from you under this General Public
+License will not have their licenses terminated so long as such parties
+remain in full compliance.
+
+ 5. By copying, distributing or modifying the Program (or any work based
+on the Program) you indicate your acceptance of this license to do so,
+and all its terms and conditions.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the original
+licensor to copy, distribute or modify the Program subject to these
+terms and conditions. You may not impose any further restrictions on the
+recipients' exercise of the rights granted herein.
+
+ 7. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of the license which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+the license, you may choose any version ever published by the Free Software
+Foundation.
+
+ 8. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 9. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 10. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ Appendix: How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to humanity, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these
+terms.
+
+ To do so, attach the following notices to the program. It is safest to
+attach them to the start of each source file to most effectively convey
+the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) 19yy <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 1, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) 19xx name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the
+appropriate parts of the General Public License. Of course, the
+commands you use may be called something other than `show w' and `show
+c'; they could even be mouse-clicks or menu items--whatever suits your
+program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the
+ program `Gnomovision' (a program to direct compilers to make passes
+ at assemblers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+That's all there is to it!
+
+
+--- The Artistic License 1.0 ---
+
+This software is Copyright (c) 2013 by Mingyi Liu, GPC Biotech AG and Altana Research Institute.
+
+This is free software, licensed under:
+
+ The Artistic License 1.0
+
+The Artistic License
+
+Preamble
+
+The intent of this document is to state the conditions under which a Package
+may be copied, such that the Copyright Holder maintains some semblance of
+artistic control over the development of the package, while giving the users of
+the package the right to use and distribute the Package in a more-or-less
+customary fashion, plus the right to make reasonable modifications.
+
+Definitions:
+
+ - "Package" refers to the collection of files distributed by the Copyright
+ Holder, and derivatives of that collection of files created through
+ textual modification.
+ - "Standard Version" refers to such a Package if it has not been modified,
+ or has been modified in accordance with the wishes of the Copyright
+ Holder.
+ - "Copyright Holder" is whoever is named in the copyright or copyrights for
+ the package.
+ - "You" is you, if you're thinking about copying or distributing this Package.
+ - "Reasonable copying fee" is whatever you can justify on the basis of media
+ cost, duplication charges, time of people involved, and so on. (You will
+ not be required to justify it to the Copyright Holder, but only to the
+ computing community at large as a market that must bear the fee.)
+ - "Freely Available" means that no fee is charged for the item itself, though
+ there may be fees involved in handling the item. It also means that
+ recipients of the item may redistribute it under the same conditions they
+ received it.
+
+1. You may make and give away verbatim copies of the source form of the
+Standard Version of this Package without restriction, provided that you
+duplicate all of the original copyright notices and associated disclaimers.
+
+2. You may apply bug fixes, portability fixes and other modifications derived
+from the Public Domain or from the Copyright Holder. A Package modified in such
+a way shall still be considered the Standard Version.
+
+3. You may otherwise modify your copy of this Package in any way, provided that
+you insert a prominent notice in each changed file stating how and when you
+changed that file, and provided that you do at least ONE of the following:
+
+ a) place your modifications in the Public Domain or otherwise make them
+ Freely Available, such as by posting said modifications to Usenet or an
+ equivalent medium, or placing the modifications on a major archive site
+ such as ftp.uu.net, or by allowing the Copyright Holder to include your
+ modifications in the Standard Version of the Package.
+
+ b) use the modified Package only within your corporation or organization.
+
+ c) rename any non-standard executables so the names do not conflict with
+ standard executables, which must also be provided, and provide a separate
+ manual page for each non-standard executable that clearly documents how it
+ differs from the Standard Version.
+
+ d) make other distribution arrangements with the Copyright Holder.
+
+4. You may distribute the programs of this Package in object code or executable
+form, provided that you do at least ONE of the following:
+
+ a) distribute a Standard Version of the executables and library files,
+ together with instructions (in the manual page or equivalent) on where to
+ get the Standard Version.
+
+ b) accompany the distribution with the machine-readable source of the Package
+ with your modifications.
+
+ c) accompany any non-standard executables with their corresponding Standard
+ Version executables, giving the non-standard executables non-standard
+ names, and clearly documenting the differences in manual pages (or
+ equivalent), together with instructions on where to get the Standard
+ Version.
+
+ d) make other distribution arrangements with the Copyright Holder.
+
+5. You may charge a reasonable copying fee for any distribution of this
+Package. You may charge any fee you choose for support of this Package. You
+may not charge a fee for this Package itself. However, you may distribute this
+Package in aggregate with other (possibly commercial) programs as part of a
+larger (possibly commercial) software distribution provided that you do not
+advertise this Package as a product of your own.
+
+6. The scripts and library files supplied as input to or produced as output
+from the programs of this Package do not automatically fall under the copyright
+of this Package, but belong to whomever generated them, and may be sold
+commercially, and may be aggregated with this Package.
+
+7. C or perl subroutines supplied by you and linked into this Package shall not
+be considered part of this Package.
+
+8. The name of the Copyright Holder may not be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+The End
+
@@ -1,17 +1,27 @@
Changes
+LICENSE
MANIFEST
-README
-lib/Bio/ASN1/EntrezGene.pm
-lib/Bio/ASN1/EntrezGene/Indexer.pm
-lib/Bio/ASN1/Sequence.pm
-lib/Bio/ASN1/Sequence/Indexer.pm
+META.json
+META.yml
Makefile.PL
-examples/regex_parser_test.pl
+README.md
+dist.ini
examples/indexer_test.pl
examples/parse_entrez_gene_example.pl
examples/parse_sequence_example.pl
+examples/regex_parser_test.pl
+lib/Bio/ASN1/EntrezGene.pm
+lib/Bio/ASN1/EntrezGene/Indexer.pm
+lib/Bio/ASN1/Sequence.pm
+lib/Bio/ASN1/Sequence/Indexer.pm
+t/00-compile.t
t/input.asn
t/input1.asn
-t/testparser.t
-t/testindexer.t
+t/release-eol.t
+t/release-mojibake.t
+t/release-no-tabs.t
+t/release-pod-coverage.t
+t/release-pod-syntax.t
t/seq.asn
+t/testindexer.t
+t/testparser.t
@@ -0,0 +1,273 @@
+{
+ "abstract" : "Regular expression-based Perl Parser for NCBI Entrez Gene.",
+ "author" : [
+ "Mingyi Liu <mingyiliu@gmail.com>"
+ ],
+ "dynamic_config" : 0,
+ "generated_by" : "Dist::Zilla version 4.300037, CPAN::Meta::Converter version 2.120921",
+ "license" : [
+ "perl_5"
+ ],
+ "meta-spec" : {
+ "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
+ "version" : "2"
+ },
+ "name" : "Bio-ASN1-EntrezGene",
+ "prereqs" : {
+ "configure" : {
+ "requires" : {
+ "ExtUtils::MakeMaker" : "6.30"
+ }
+ },
+ "develop" : {
+ "requires" : {
+ "Pod::Coverage::TrustPod" : "0",
+ "Test::Pod" : "1.41",
+ "Test::Pod::Coverage" : "1.08"
+ }
+ },
+ "runtime" : {
+ "requires" : {
+ "Bio::Index::AbstractSeq" : "0",
+ "Carp" : "0",
+ "parent" : "0",
+ "strict" : "0",
+ "utf8" : "0",
+ "warnings" : "0"
+ }
+ },
+ "test" : {
+ "requires" : {
+ "File::Spec" : "0",
+ "IO::Handle" : "0",
+ "IPC::Open3" : "0",
+ "Test::More" : "0"
+ }
+ }
+ },
+ "release_status" : "stable",
+ "resources" : {
+ "bugtracker" : {
+ "mailto" : "bioperl-l@bioperl.org",
+ "web" : "https://redmine.open-bio.org/projects/bioperl/"
+ },
+ "homepage" : "http://search.cpan.org/dist/Bio-ASN1-EntrezGene",
+ "repository" : {
+ "type" : "git",
+ "url" : "git://github.com/bioperl/bio-asn1-entrezgene.git",
+ "web" : "https://github.com/bioperl/bio-asn1-entrezgene"
+ }
+ },
+ "version" : "1.70",
+ "x_Dist_Zilla" : {
+ "perl" : {
+ "version" : "5.018001"
+ },
+ "plugins" : [
+ {
+ "class" : "Dist::Zilla::Plugin::GatherDir",
+ "name" : "@BioPerl/@Filter/GatherDir",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::PruneCruft",
+ "name" : "@BioPerl/@Filter/PruneCruft",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::ManifestSkip",
+ "name" : "@BioPerl/@Filter/ManifestSkip",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::MetaYAML",
+ "name" : "@BioPerl/@Filter/MetaYAML",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::License",
+ "name" : "@BioPerl/@Filter/License",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::ExtraTests",
+ "name" : "@BioPerl/@Filter/ExtraTests",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::ExecDir",
+ "name" : "@BioPerl/@Filter/ExecDir",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::ShareDir",
+ "name" : "@BioPerl/@Filter/ShareDir",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::MakeMaker",
+ "name" : "@BioPerl/@Filter/MakeMaker",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::Manifest",
+ "name" : "@BioPerl/@Filter/Manifest",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::TestRelease",
+ "name" : "@BioPerl/@Filter/TestRelease",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::ConfirmRelease",
+ "name" : "@BioPerl/@Filter/ConfirmRelease",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::UploadToCPAN",
+ "name" : "@BioPerl/@Filter/UploadToCPAN",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::MetaConfig",
+ "name" : "@BioPerl/MetaConfig",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::MetaJSON",
+ "name" : "@BioPerl/MetaJSON",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::PkgVersion",
+ "name" : "@BioPerl/PkgVersion",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::PodSyntaxTests",
+ "name" : "@BioPerl/PodSyntaxTests",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::NoTabsTests",
+ "name" : "@BioPerl/NoTabsTests",
+ "version" : "0.01"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::NextRelease",
+ "name" : "@BioPerl/NextRelease",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::Test::Compile",
+ "config" : {
+ "Dist::Zilla::Plugin::Test::Compile" : {
+ "module_finder" : [
+ ":InstallModules"
+ ],
+ "script_finder" : [
+ ":ExecFiles"
+ ]
+ }
+ },
+ "name" : "@BioPerl/Test::Compile",
+ "version" : "2.027"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::PodCoverageTests",
+ "name" : "@BioPerl/PodCoverageTests",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::MojibakeTests",
+ "name" : "@BioPerl/MojibakeTests",
+ "version" : "0.5"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::AutoPrereqs",
+ "name" : "@BioPerl/AutoPrereqs",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::AutoMetaResources",
+ "name" : "@BioPerl/AutoMetaResources",
+ "version" : "1.20"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::MetaResources",
+ "name" : "@BioPerl/MetaResources",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::Authority",
+ "name" : "@BioPerl/Authority",
+ "version" : "1.006"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::EOLTests",
+ "name" : "@BioPerl/EOLTests",
+ "version" : "0.02"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::PodWeaver",
+ "name" : "@BioPerl/PodWeaver",
+ "version" : "3.101642"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::Git::Check",
+ "name" : "@BioPerl/Git::Check",
+ "version" : "2.014"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::Git::Commit",
+ "name" : "@BioPerl/Git::Commit",
+ "version" : "2.014"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::Git::Tag",
+ "name" : "@BioPerl/Git::Tag",
+ "version" : "2.014"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::FinderCode",
+ "name" : ":InstallModules",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::FinderCode",
+ "name" : ":IncModules",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::FinderCode",
+ "name" : ":TestFiles",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::FinderCode",
+ "name" : ":ExecFiles",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::FinderCode",
+ "name" : ":ShareFiles",
+ "version" : "4.300037"
+ },
+ {
+ "class" : "Dist::Zilla::Plugin::FinderCode",
+ "name" : ":MainModule",
+ "version" : "4.300037"
+ }
+ ],
+ "zilla" : {
+ "class" : "Dist::Zilla::Dist::Builder",
+ "config" : {
+ "is_trial" : "0"
+ },
+ "version" : "4.300037"
+ }
+ },
+ "x_authority" : "cpan:BIOPERLML"
+}
+
@@ -0,0 +1,194 @@
+---
+abstract: 'Regular expression-based Perl Parser for NCBI Entrez Gene.'
+author:
+ - 'Mingyi Liu <mingyiliu@gmail.com>'
+build_requires:
+ File::Spec: 0
+ IO::Handle: 0
+ IPC::Open3: 0
+ Test::More: 0
+configure_requires:
+ ExtUtils::MakeMaker: 6.30
+dynamic_config: 0
+generated_by: 'Dist::Zilla version 4.300037, CPAN::Meta::Converter version 2.120921'
+license: perl
+meta-spec:
+ url: http://module-build.sourceforge.net/META-spec-v1.4.html
+ version: 1.4
+name: Bio-ASN1-EntrezGene
+requires:
+ Bio::Index::AbstractSeq: 0
+ Carp: 0
+ parent: 0
+ strict: 0
+ utf8: 0
+ warnings: 0
+resources:
+ bugtracker: https://redmine.open-bio.org/projects/bioperl/
+ homepage: http://search.cpan.org/dist/Bio-ASN1-EntrezGene
+ repository: git://github.com/bioperl/bio-asn1-entrezgene.git
+version: 1.70
+x_Dist_Zilla:
+ perl:
+ version: 5.018001
+ plugins:
+ -
+ class: Dist::Zilla::Plugin::GatherDir
+ name: '@BioPerl/@Filter/GatherDir'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::PruneCruft
+ name: '@BioPerl/@Filter/PruneCruft'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::ManifestSkip
+ name: '@BioPerl/@Filter/ManifestSkip'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::MetaYAML
+ name: '@BioPerl/@Filter/MetaYAML'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::License
+ name: '@BioPerl/@Filter/License'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::ExtraTests
+ name: '@BioPerl/@Filter/ExtraTests'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::ExecDir
+ name: '@BioPerl/@Filter/ExecDir'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::ShareDir
+ name: '@BioPerl/@Filter/ShareDir'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::MakeMaker
+ name: '@BioPerl/@Filter/MakeMaker'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::Manifest
+ name: '@BioPerl/@Filter/Manifest'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::TestRelease
+ name: '@BioPerl/@Filter/TestRelease'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::ConfirmRelease
+ name: '@BioPerl/@Filter/ConfirmRelease'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::UploadToCPAN
+ name: '@BioPerl/@Filter/UploadToCPAN'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::MetaConfig
+ name: '@BioPerl/MetaConfig'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::MetaJSON
+ name: '@BioPerl/MetaJSON'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::PkgVersion
+ name: '@BioPerl/PkgVersion'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::PodSyntaxTests
+ name: '@BioPerl/PodSyntaxTests'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::NoTabsTests
+ name: '@BioPerl/NoTabsTests'
+ version: 0.01
+ -
+ class: Dist::Zilla::Plugin::NextRelease
+ name: '@BioPerl/NextRelease'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::Test::Compile
+ config:
+ Dist::Zilla::Plugin::Test::Compile:
+ module_finder:
+ - ':InstallModules'
+ script_finder:
+ - ':ExecFiles'
+ name: '@BioPerl/Test::Compile'
+ version: 2.027
+ -
+ class: Dist::Zilla::Plugin::PodCoverageTests
+ name: '@BioPerl/PodCoverageTests'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::MojibakeTests
+ name: '@BioPerl/MojibakeTests'
+ version: 0.5
+ -
+ class: Dist::Zilla::Plugin::AutoPrereqs
+ name: '@BioPerl/AutoPrereqs'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::AutoMetaResources
+ name: '@BioPerl/AutoMetaResources'
+ version: 1.20
+ -
+ class: Dist::Zilla::Plugin::MetaResources
+ name: '@BioPerl/MetaResources'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::Authority
+ name: '@BioPerl/Authority'
+ version: 1.006
+ -
+ class: Dist::Zilla::Plugin::EOLTests
+ name: '@BioPerl/EOLTests'
+ version: 0.02
+ -
+ class: Dist::Zilla::Plugin::PodWeaver
+ name: '@BioPerl/PodWeaver'
+ version: 3.101642
+ -
+ class: Dist::Zilla::Plugin::Git::Check
+ name: '@BioPerl/Git::Check'
+ version: 2.014
+ -
+ class: Dist::Zilla::Plugin::Git::Commit
+ name: '@BioPerl/Git::Commit'
+ version: 2.014
+ -
+ class: Dist::Zilla::Plugin::Git::Tag
+ name: '@BioPerl/Git::Tag'
+ version: 2.014
+ -
+ class: Dist::Zilla::Plugin::FinderCode
+ name: ':InstallModules'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::FinderCode
+ name: ':IncModules'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::FinderCode
+ name: ':TestFiles'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::FinderCode
+ name: ':ExecFiles'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::FinderCode
+ name: ':ShareFiles'
+ version: 4.300037
+ -
+ class: Dist::Zilla::Plugin::FinderCode
+ name: ':MainModule'
+ version: 4.300037
+ zilla:
+ class: Dist::Zilla::Dist::Builder
+ config:
+ is_trial: 0
+ version: 4.300037
+x_authority: cpan:BIOPERLML
@@ -1,10 +1,75 @@
-require 5.004;
+# Makefile.PL: builds the WriteMakefile() arguments for Bio-ASN1-EntrezGene (presumably Dist::Zilla-generated, see META.json - confirm before hand-editing).
use strict;
-use ExtUtils::MakeMaker;
+use warnings;
+
+
+
+use ExtUtils::MakeMaker 6.30;
-WriteMakefile(
- NAME => 'Bio::ASN1::EntrezGene',
- VERSION_FROM => 'lib/Bio/ASN1/EntrezGene.pm',
- AUTHOR => 'Mingyi Liu (mingyi.liu[at]gpc-biotech.com)',
- PREREQ_PM => {'Bio::Index::AbstractSeq' => '0'}
+
+# Full argument set for WriteMakefile(); the blocks further down remove or merge keys when the installed ExtUtils::MakeMaker fails a version check.
+my %WriteMakefileArgs = (
+ "ABSTRACT" => "Regular expression-based Perl Parser for NCBI Entrez Gene.",
+ "AUTHOR" => "Mingyi Liu <mingyiliu\@gmail.com>",
+ "BUILD_REQUIRES" => {},
+ "CONFIGURE_REQUIRES" => {
+ "ExtUtils::MakeMaker" => "6.30"
+ },
+ "DISTNAME" => "Bio-ASN1-EntrezGene",
+ "EXE_FILES" => [],
+ "LICENSE" => "perl",
+ "NAME" => "Bio::ASN1::EntrezGene",
+ "PREREQ_PM" => {
+ "Bio::Index::AbstractSeq" => 0,
+ "Carp" => 0,
+ "parent" => 0,
+ "strict" => 0,
+ "utf8" => 0,
+ "warnings" => 0
+ },
+ "TEST_REQUIRES" => {
+ "File::Spec" => 0,
+ "IO::Handle" => 0,
+ "IPC::Open3" => 0,
+ "Test::More" => 0
+ },
+ "VERSION" => "1.70",
+ "test" => {
+ "TESTS" => "t/*.t"
+ }
);
+
+# When the VERSION(6.63_03) check fails, remove TEST_REQUIRES and merge its entries into BUILD_REQUIRES, keeping the higher version on conflict (presumably older EUMM lacks TEST_REQUIRES - confirm).
+unless ( eval { ExtUtils::MakeMaker->VERSION(6.63_03) } ) {
+ my $tr = delete $WriteMakefileArgs{TEST_REQUIRES};
+ my $br = $WriteMakefileArgs{BUILD_REQUIRES};
+ for my $mod ( keys %$tr ) {
+ if ( exists $br->{$mod} ) {
+ $br->{$mod} = $tr->{$mod} if $tr->{$mod} > $br->{$mod};
+ }
+ else {
+ $br->{$mod} = $tr->{$mod};
+ }
+ }
+}
+# When the VERSION(6.56) check fails, remove BUILD_REQUIRES and merge its entries into PREREQ_PM the same way (higher version wins on conflict).
+unless ( eval { ExtUtils::MakeMaker->VERSION(6.56) } ) {
+ my $br = delete $WriteMakefileArgs{BUILD_REQUIRES};
+ my $pp = $WriteMakefileArgs{PREREQ_PM};
+ for my $mod ( keys %$br ) {
+ if ( exists $pp->{$mod} ) {
+ $pp->{$mod} = $br->{$mod} if $br->{$mod} > $pp->{$mod};
+ }
+ else {
+ $pp->{$mod} = $br->{$mod};
+ }
+ }
+}
+# When the VERSION(6.52) check fails, drop CONFIGURE_REQUIRES entirely before calling WriteMakefile().
+delete $WriteMakefileArgs{CONFIGURE_REQUIRES}
+ unless eval { ExtUtils::MakeMaker->VERSION(6.52) };
+
+WriteMakefile(%WriteMakefileArgs);
+
+
+
@@ -1,34 +0,0 @@
-This is the README file for Bio::ASN1::EntrezGene package, which
-includes:
-1. XML parser-like parser for the ASN.1-formatted NCBI Entrez Gene files.
-2. Indexer for Entrez Gene files.
-3. XML parser-like parser for the ASN.1-formatted NCBI Sequence files.
-4. Indexer for Sequence files.
-
-These modules have quite high performance and error reporting capabilities.
-Additionally, one could dump the data structure generated from extracted
-NCBI object records into XML extremely easily using XML::Simple's XMLout().
-
-Written and maintained by Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>.
-Copyright (c) 2005 Mingyi Liu, GPC Biotech, Altana Research Institute.
-All rights reserved.
-
-This program is free software - you can redistribute it and/or modify
-it under the same terms as Perl itself.
-
-INSTALLATION
-
-Bio::ASN1::EntrezGene package can be installed & tested as follows:
-
-perl Makefile.PL
-make
-make test
-make install
-
-DOCUMENTATION
-
-For documentation, among many other things, please refer to the POD (
-plain old documentation) inside the module.
-
-It is highly recommended that you check the example scripts out (under
-the examples directory)!
@@ -1,5 +0,0 @@
-Please note that Bio::ASN1::EntrezGene package version 1.091 is exactly the
-same as version 1.09 (the only difference being that this file only exists
-in version 1.091). The reason for releasing a version 1.091 is that CPAN
-indexing of Bio::ASN1::EntrezGene version 1.09 had some problem that is not
-yet fixed & I'd have to upload a package with different file name.
@@ -0,0 +1,41 @@
+Bio-ASN1-EntrezGene
+===================
+
+This distribution includes:
+1. XML parser-like parser for the ASN.1-formatted NCBI Entrez Gene files.
+2. Indexer for Entrez Gene files.
+3. XML parser-like parser for the ASN.1-formatted NCBI Sequence files.
+4. Indexer for Sequence files.
+
+These modules have quite high performance and error reporting capabilities.
+Additionally, one could dump the data structure generated from extracted
+NCBI object records into XML extremely easily using XML::Simple's XMLout().
+
+Written by Dr. Mingyi Liu <mingyiliu@gmail.com>.
+Copyright (c) 2005 Mingyi Liu, GPC Biotech, Altana Research Institute.
+
+This program is free software - you can redistribute it and/or modify
+it under the same terms as Perl itself.
+
+INSTALLATION
+------------
+
+Bio::ASN1::EntrezGene package can be installed & tested as follows:
+
+ perl Makefile.PL
+ make
+ make test
+ make install
+
+DOCUMENTATION
+-------------
+
+For documentation, among many other things, please refer to the POD (
+plain old documentation) inside the module.
+
+It is highly recommended that you check the example scripts out (under
+the examples directory)!
+
+- - -
+
+This distribution is part of the [BioPerl](http://www.bioperl.org/) project.
@@ -0,0 +1,8 @@
+name = Bio-ASN1-EntrezGene
+version = 1.70
+author = Mingyi Liu <mingyiliu@gmail.com>
+license = Perl_5
+copyright_holder = Mingyi Liu, GPC Biotech AG and Altana Research Institute
+copyright_year = 2013
+
+[@BioPerl]
@@ -582,8 +582,9 @@ sub safeassign
# safely extracts a value, another choice is to simply use
# eval in-line, if it fails, it fails. Probably faster, but can't
# give feedback in-line (always has to add a couple lines dealing with
-# $@ for error reporting), might still be worth it though because
+# $@ for error reporting), might still be worth it though because
# of the speed. User can make his/her own choice here.
+
sub safeval
{
my ($ds, $str) = @_; # data structure and string (we need $ds passed in because we use strict)
@@ -1,105 +1,42 @@
-=head1 NAME
-
-Bio::ASN1::EntrezGene::Indexer - Indexes NCBI Entrez Gene files.
-
-=head1 SYNOPSIS
-
- use Bio::ASN1::EntrezGene::Indexer;
-
- # creating & using the index is just a few lines
- my $inx = Bio::ASN1::EntrezGene::Indexer->new(
- -filename => 'entrezgene.idx',
- -write_flag => 'WRITE'); # needed for make_index call, but if opening
- # existing index file, don't set write flag!
- $inx->make_index('Homo_sapiens', 'Mus_musculus', 'Rattus_norvegicus');
- my $seq = $inx->fetch(10); # Bio::Seq obj for Entrez Gene #10
- # alternatively, if one prefers just a data structure instead of objects
- $seq = $inx->fetch_hash(10); # a hash produced by Bio::ASN1::EntrezGene
- # that contains all data in the Entrez Gene record
-
- # note that in case you wonder, you can get the files 'Homo_sapiens'
- # from NCBI Entrez Gene ftp download, DATA/ASN/Mammalia directory
-
-=head1 PREREQUISITE
-
-Bio::ASN1::EntrezGene, Bioperl version that contains Stefan Kirov's
-entrezgene.pm and all dependencies therein.
-
-=head1 INSTALLATION
-
-Same as Bio::ASN1::EntrezGene
-
-=head1 DESCRIPTION
-
-Bio::ASN1::EntrezGene::Indexer is a Perl Indexer for NCBI Entrez Gene genome
-databases. It processes an ASN.1-formatted Entrez Gene record and stores the
-file position for each record in a way compliant with Bioperl standard (in
-fact its a subclass of Bioperl's index objects).
-
-Note that this module does not parse record, because it needs to run fast and
-grab only the gene ids. For parsing record, use Bio::ASN1::EntrezGene, or
-better yet, use Bio::SeqIO, format 'entrezgene'.
-
-It takes this module (version 1.07) 21 seconds to index the human genome
-Entrez Gene file (Apr. 5/2005 download) on one 2.4 GHz Intel Xeon processor.
-
-=head1 SEE ALSO
-
-For details on various parsers I generated for Entrez Gene, example scripts that
-uses/benchmarks the modules, please see L<http://sourceforge.net/projects/egparser/>.
-Those other parsers etc. are included in V1.05 download.
-
-=head1 AUTHOR
-
-Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>
-
-=head1 COPYRIGHT
-
-The Bio::ASN1::EntrezGene module and its related modules and scripts
-are copyright (c) 2005 Mingyi Liu, GPC Biotech AG and Altana Research
-Institute. All rights reserved. I created these modules when working
-on a collaboration project between these two companies. Therefore a
-special thanks for the two companies to allow the release of the code
-into public domain.
-
-You may use and distribute them under the terms of the Perl itself or
-GPL (L<http://www.gnu.org/copyleft/gpl.html>).
-
-=head1 CITATION
-
-Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
-Bioinformatics. In press
-
-=head1 OPERATION SYSTEMS SUPPORTED
-
-Any OS that Perl & Bioperl run on.
-
-=head1 METHODS
-
-=cut
-
package Bio::ASN1::EntrezGene::Indexer;
-
+BEGIN {
+ $Bio::ASN1::EntrezGene::Indexer::AUTHORITY = 'cpan:BIOPERLML';
+}
+{
+ $Bio::ASN1::EntrezGene::Indexer::VERSION = '1.70';
+}
+use utf8;
use strict;
+use warnings;
use Carp qw(carp croak);
-use vars qw ($VERSION @ISA);
use Bio::ASN1::EntrezGene;
use Bio::Index::AbstractSeq;
+use parent qw(Bio::Index::AbstractSeq);
+
+# ABSTRACT: Indexes NCBI Entrez Gene files.
+# AUTHOR: Dr. Mingyi Liu <mingyiliu@gmail.com>
+# OWNER: 2005 Mingyi Liu
+# OWNER: 2005 GPC Biotech AG
+# OWNER: 2005 Altana Research Institute
+# LICENSE: Perl_5
-@ISA = qw(Bio::Index::AbstractSeq);
-$VERSION = '1.09';
+
+
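+# TODO: Should this be deprecated? NOTE(review): this now returns the parent Bio::Index::AbstractSeq version, not this module's own - confirm.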
sub _version
{
- return $VERSION;
+ return $Bio::Index::AbstractSeq::VERSION;
}
+
sub _type_stamp
{
return '__EntrezGene_ASN1__';
}
-sub _index_file
+
+sub _index_file
{
my($self, $file, $idx) = @_;
my $position;
@@ -115,22 +52,100 @@ sub _index_file
return 1;
}
+
sub _file_format
{
return 'entrezgene';
}
+
+
+sub fetch_hash # look up one gene id in the index and return what the parser's next_seq yields for it
+{
+ my ($self, $geneid) = @_;
+ if (my $gene = $self->db->{$geneid}) # packed index record stored under this gene id, if any
+ {
+ my ($fileno, $position) = $self->unpack_record($gene); # $fileno selects the file handle, $position is the seek offset
+ my $parser = Bio::ASN1::EntrezGene->new('fh' => $self->_file_handle($fileno));
+ seek($parser->fh, $position, 0); # whence 0: $position is an absolute offset from the start of the file
+ return $parser->next_seq; # parse the record that starts at that offset
+ }
+} # an id with no (true) index entry falls through here without an explicit return
+
+
+sub _file_handle { # returns the cached read handle for indexed file number $i, opening the file on first use
+    my( $self, $i ) = @_;
+
+    unless ($self->{'_filehandle'}[$i]) {
+        my @rec = $self->unpack_record($self->db->{"__FILE_$i"}) # first field of the "__FILE_$i" record is the file name
+            or $self->throw("Can't get filename for index : $i");
+        my $file = $rec[0];
+        local *FH; # fresh glob per call, so each cached handle is independent
+        open *FH, '<', $file or $self->throw("Can't read file '$file' : $!"); # explicit '<': a stored name is never parsed as a mode or pipe
+        $self->{'_filehandle'}[$i] = *FH; # Cache filehandle
+    }
+    return $self->{'_filehandle'}[$i];
+}
+
+1;
+
+__END__
+
+=pod
+
+=encoding utf-8
+
+=head1 NAME
+
+Bio::ASN1::EntrezGene::Indexer - Indexes NCBI Entrez Gene files.
+
+=head1 VERSION
+
+version 1.70
+
+=head1 SYNOPSIS
+
+ use Bio::ASN1::EntrezGene::Indexer;
+
+ # creating & using the index is just a few lines
+ my $inx = Bio::ASN1::EntrezGene::Indexer->new(
+ -filename => 'entrezgene.idx',
+ -write_flag => 'WRITE'); # needed for make_index call, but if opening
+ # existing index file, don't set write flag!
+ $inx->make_index('Homo_sapiens', 'Mus_musculus', 'Rattus_norvegicus');
+ my $seq = $inx->fetch(10); # Bio::Seq obj for Entrez Gene #10
+ # alternatively, if one prefers just a data structure instead of objects
+ $seq = $inx->fetch_hash(10); # a hash produced by Bio::ASN1::EntrezGene
+ # that contains all data in the Entrez Gene record
+
+ # note that in case you wonder, you can get the files 'Homo_sapiens'
+ # from NCBI Entrez Gene ftp download, DATA/ASN/Mammalia directory
+
+=head1 DESCRIPTION
+
+Bio::ASN1::EntrezGene::Indexer is a Perl Indexer for NCBI Entrez Gene genome
+databases. It processes an ASN.1-formatted Entrez Gene record and stores the
+file position for each record in a way compliant with Bioperl standard (in
+fact it's a subclass of Bioperl's index objects).
+
+Note that this module does not parse record, because it needs to run fast and
+grab only the gene ids. For parsing record, use Bio::ASN1::EntrezGene, or
+better yet, use Bio::SeqIO, format 'entrezgene'.
+
+It takes this module (version 1.07) 21 seconds to index the human genome
+Entrez Gene file (Apr. 5/2005 download) on one 2.4 GHz Intel Xeon processor.
+
+=head1 METHODS
+
=head2 fetch
Parameters: $geneid - id for the Entrez Gene record to be retrieved
Example: my $hash = $indexer->fetch(10); # get Entrez Gene #10
Function: fetch the data for the given Entrez Gene id.
Returns: A Bio::Seq object produced by Bio::SeqIO::entrezgene
- Notes: One needs to have Bio::SeqIO::entrezgene installed before
+ Notes: One needs to have Bio::SeqIO::entrezgene installed before
calling this function!
-=cut
-
=head2 fetch_hash
Parameters: $geneid - id for the Entrez Gene record to be retrieved
@@ -141,19 +156,15 @@ sub _file_format
Gene record.
Notes: Alternative to fetch()
-=cut
+=head1 INTERNAL METHODS
-sub fetch_hash
-{
- my ($self, $geneid) = @_;
- if (my $gene = $self->db->{$geneid})
- {
- my ($fileno, $position) = $self->unpack_record($gene);
- my $parser = Bio::ASN1::EntrezGene->new('fh' => $self->_file_handle($fileno));
- seek($parser->fh, $position, 0);
- return $parser->next_seq;
- }
-}
+=head2 _version
+
+=head2 _type_stamp
+
+=head2 _index_file
+
+=head2 _file_format
=head2 _file_handle
@@ -171,21 +182,67 @@ sub fetch_hash
changes file handle code like I do below to fit perl 5.005_03, this
sub would be removed from this module
-=cut
+=head1 PREREQUISITE
-sub _file_handle {
- my( $self, $i ) = @_;
-
- unless ($self->{'_filehandle'}[$i]) {
- my @rec = $self->unpack_record($self->db->{"__FILE_$i"})
- or $self->throw("Can't get filename for index : $i");
- my $file = $rec[0];
- local *FH;
- open *FH, $file or $self->throw("Can't read file '$file' : $!");
- $self->{'_filehandle'}[$i] = *FH; # Cache filehandle
- }
- return $self->{'_filehandle'}[$i];
-}
+Bio::ASN1::EntrezGene, Bioperl version that contains Stefan Kirov's
+entrezgene.pm and all dependencies therein.
-1;
+=head1 INSTALLATION
+
+Same as Bio::ASN1::EntrezGene
+
+=head1 SEE ALSO
+
+For details on various parsers I generated for Entrez Gene, example scripts that
+uses/benchmarks the modules, please see L<http://sourceforge.net/projects/egparser/>.
+Those other parsers etc. are included in V1.05 download.
+=head1 CITATION
+
+Liu, Mingyi, and Andrei Grigoriev. "Fast parsers for Entrez Gene."
+Bioinformatics 21, no. 14 (2005): 3189-3190.
+
+=head1 OPERATION SYSTEMS SUPPORTED
+
+Any OS that Perl & Bioperl run on.
+
+=head1 FEEDBACK
+
+=head2 Mailing lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list. Your participation is much appreciated.
+
+ bioperl-l@bioperl.org - General discussion
+ http://bioperl.org/wiki/Mailing_lists - About the mailing lists
+
+=head2 Support
+
+Please direct usage questions or support issues to the mailing list:
+I<bioperl-l@bioperl.org>
+
+rather than to the module maintainer directly. Many experienced and
+responsive experts will be able to look at the problem and quickly
+address it. Please include a thorough description of the problem
+with code and data examples if at all possible.
+
+=head2 Reporting bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via the
+web:
+
+ https://redmine.open-bio.org/projects/bioperl/
+
+=head1 AUTHOR
+
+Dr. Mingyi Liu <mingyiliu@gmail.com>
+
+=head1 COPYRIGHT
+
+This software is copyright (c) 2005 by Mingyi Liu, 2005 by GPC Biotech AG, and 2005 by Altana Research Institute.
+
+This software is available under the same terms as the perl 5 programming language system itself.
+
+=cut
@@ -1,135 +1,23 @@
-=head1 NAME
-
-Bio::ASN1::EntrezGene - Regular expression-based Perl Parser for NCBI Entrez Gene.
-
-=head1 SYNOPSIS
-
- use Bio::ASN1::EntrezGene;
-
- my $parser = Bio::ASN1::EntrezGene->new('file' => "Homo_sapiens");
- while(my $result = $parser->next_seq)
- {
- # extract data from $result, or Dumpvalue->new->dumpValue($result);
- }
-
- # a new way to get the $result data hash for a particular gene id:
- use Bio::ASN1::EntrezGene::Indexer;
- my $inx = Bio::ASN1::EntrezGene::Indexer->new(-filename => 'entrezgene.idx');
- my $seq = $inx->fetch_hash(10); # returns $result for Entrez Gene record
- # with geneid 10
- # note that the index file 'entrezgene.idx' can be created as follows
- my $inx = Bio::ASN1::EntrezGene::Indexer->new(
- -filename => 'entrezgene.idx',
- -write_flag => 'WRITE');
- $inx->make_index('Homo_sapiens', 'Mus_musculus'); # files come from NCBI download
-
- # for more detail please refer to Bio::ASN1::EntrezGene::Indexer perldoc
-
-=head1 PREREQUISITE
-
-None.
-
-=head1 INSTALLATION
-
-Bio::ASN1::EntrezGene package can be installed & tested as follows:
-
- perl Makefile.PL
- make
- make test
- make install
-
-=head1 DESCRIPTION
-
-Bio::ASN1::EntrezGene is a regular expression-based Perl Parser for NCBI Entrez
-Gene genome databases (L<http://www.ncbi.nih.gov/entrez/query.fcgi?db=gene>). It
-parses an ASN.1-formatted Entrez Gene record and returns a data structure that
-contains all data items from the gene record.
-
-The parser will report error & line number if input data does not conform to the
-NCBI Entrez Gene genome annotation file format.
-
-Note that it is possible to provide reading of all NCBI's ASN.1-formatted
-files through simple variations of the Entrez Gene parser (I need more
-investigation to be sure, but at least the sequence parser is a very simple
-variation on Entrez Gene parser and works well).
-
-It took the parser version 1.0 11 minutes to parse the human genome Entrez Gene
-file on one 2.4 GHz Intel Xeon processor. The addition of validation and error
-reporting in 1.03 and handling of new Entrez Gene format slowed the parser down
-about 40%.
-
-Since V1.07, this package also included an indexer that runs pretty fast (it
-takes 21 seconds for the indexer to index the human genome on the same
-processor). Therefore the combination of the modules would allow user to
-retrieve and parse arbitrary records.
-
-=head1 SEE ALSO
-
-The parse_entrez_gene_example.pl script included in this package (please
-see the Bio-ASN1-EntrezGene-x.xx/examples directory) is a very
-important and near-complete demo on using this module to extract all data
-items from Entrez Gene records. Do check it out because in fact, this
-script took me about 3-4 times more time to make for my project than the
-parser V1.0 itself. Note that the example script was edited to leave
-out stuff specific to my internal project.
-
-For details on various parsers I generated for Entrez Gene, example scripts that
-uses/benchmarks the modules, please see L<http://sourceforge.net/projects/egparser/>.
-Those other parsers etc. are included in V1.05 download.
-
-=head1 AUTHOR
-
-Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>
-
-=head1 COPYRIGHT
-
-The Bio::ASN1::EntrezGene module and its related modules and scripts
-are copyright (c) 2005 Mingyi Liu, GPC Biotech AG and Altana Research
-Institute. All rights reserved. I created these modules when working
-on a collaboration project between these two companies. Therefore a
-special thanks for the two companies to allow the release of the code
-into public domain.
-
-You may use and distribute them under the terms of the Perl itself or
-GPL (L<http://www.gnu.org/copyleft/gpl.html>).
-
-=head1 CITATION
-
-Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
-Bioinformatics. In press
-
-=head1 OPERATION SYSTEMS SUPPORTED
-
-Any OS that Perl runs on.
-
-=head1 METHODS
-
-=cut
-
package Bio::ASN1::EntrezGene;
-
+BEGIN {
+ $Bio::ASN1::EntrezGene::AUTHORITY = 'cpan:BIOPERLML';
+}
+{
+ $Bio::ASN1::EntrezGene::VERSION = '1.70';
+}
+use utf8;
use strict;
+use warnings;
use Carp qw(carp croak);
-use vars qw ($VERSION);
-$VERSION = '1.09';
+# ABSTRACT: Regular expression-based Perl Parser for NCBI Entrez Gene.
+# AUTHOR: Dr. Mingyi Liu <mingyiliu@gmail.com>
+# OWNER: 2005 Mingyi Liu
+# OWNER: 2005 GPC Biotech AG
+# OWNER: 2005 Altana Research Institute
+# LICENSE: Perl_5
-=head2 new
- Parameters: maxerrstr => 20 (optional) - maximum number of characters after
- offending element, used by error reporting, default is 20
- file or -file => $filename (optional) - name of the file to be
- parsed. call next_seq to parse!
- fh or -fh => $filehandle (optional) - handle of the file to be
- parsed.
- Example: my $parser = Bio::ASN1::EntrezGene->new();
- Function: Instantiate a parser object
- Returns: Object reference
- Notes: Setting file or fh will reset line numbers etc. that are used
- for error reporting purposes, and seeking on file handle would
- mess up linenumbers!
-
-=cut
sub new
{
@@ -142,16 +30,6 @@ sub new
return $self;
}
-=head2 maxerrstr
-
- Parameters: $maxerrstr (optional) - maximum number of characters after
- offending element, used by error reporting, default is 20
- Example: $parser->maxerrstr(20);
- Function: get/set maxerrstr.
- Returns: maxerrstr.
- Notes:
-
-=cut
sub maxerrstr
{
@@ -161,28 +39,6 @@ sub maxerrstr
}
-=head2 parse
-
- Parameters: $string that contains Entrez Gene record,
- $trimopt (optional) that specifies how the data structure
- returned should be trimmed. 2 is recommended and
- default
- $noreset (optional) that species that line number should not
- be reset
- DEPRECATED as external function!!! Do not call this function
- directly! Call next_seq() instead
- Example: my $value = $parser->parse($text); # DEPRECATED as
- # external function!!! Do not call this function
- # directly! Call next_seq() instead
- Function: Takes in a string representing Entrez Gene record, parses
- the record and returns a data structure.
- Returns: A data structure containing all data items from the Entrez
- Gene record.
- Notes: DEPRECATED as external function!!! Do not call this function
- directly! Call next_seq() instead
- $string should not contain 'EntrezGene ::=' at beginning!
-
-=cut
sub parse
{
@@ -212,19 +68,6 @@ sub parse
return $result;
}
-=head2 input_file
-
- Parameters: $filename for file that contains Entrez Gene record(s)
- Example: $parser->input_file($filename);
- Function: Takes in name of a file containing Entrez Gene records.
- opens the file and stores file handle
- Returns: none.
- Notes: Attemps to open file larger than 2 GB even on Perl that
- does not support 2 GB file (accomplished by calling
- "cat" and piping output. On OS that does not have "cat"
- error message will be displayed)
-
-=cut
sub input_file
{
@@ -239,27 +82,6 @@ sub input_file
$self->{linenumber} = 0; # reset line number
}
-=head2 next_seq
-
- Parameters: $trimopt (optional) that specifies how the data structure
- returned should be trimmed. option 2 is recommended and
- default
- Example: my $value = $parser->next_seq();
- Function: Use the file handle generated by input_file, parses the next
- the record and returns a data structure.
- Returns: A data structure containing all data items from the Entrez
- Gene record.
- Notes: Must pass in a filename through new() or input_file() first!
- For details on how to use the $trimopt data trimming option
- please see comment for the trimdata method. An option
- of 2 is recommended and default
- The acceptable values for $trimopt include:
- 1 - trim as much as possibile
- 2 (or 0, undef) - trim to an easy-to-use structure
- 3 - no trimming (in version 1.06, prior to version
- 1.06, 0 or undef means no trimming)
-
-=cut
sub next_seq
{
@@ -270,13 +92,12 @@ sub next_seq
{
chomp;
next unless /\S/;
- my $tmp = (/^\s*Entrezgene ::= ({.*)/si)? $1 : "{" . $_; # get rid of the 'Entrezgene ::= ' at the beginning of Entrez Gene record
+ my $tmp = (/^\s*Entrezgene(-Set)? ::= ({.*)/si)? $2 : "{" . $_; # get rid of the 'Entrezgene ::= ' at the beginning of Entrez Gene record
return $self->parse($tmp, $compact, 1); # 1 species no resetting line number
}
}
-# NCBI's Apr 05, 2005 format change forced much usage of lookahead, which would for
-# sure slows parser down. But can't code efficiently without it.
+
sub _parse
{
my ($self, $flag) = @_;
@@ -310,7 +131,7 @@ sub _parse
$self->{linenumber} += $lines =~ s/\n//g || $lines =~ s/\r//g; # count by *NIX/Win or Mac
my $tmp;
# we put \s* in lookahead for linenumber counting purpose (which slows things down)
- if(($self->{input} =~ /\G"((?:[^"]|"")*)"(?=\s*[,}])/cg && ++$tmp) ||
+ if(($self->{input} =~ /\G"((?:[^"]+|"")*)"(?=\s*[,}])/cg && ++$tmp) ||
$self->{input} =~ /\G([\w-]+)(?=\s*[,}])/cg)
{
my $value = $1;
@@ -344,7 +165,7 @@ sub _parse
return $data;
}
}
- elsif($self->{input} =~ /\G[ \t]*"((?:[^"]|"")*)"(?=\s*[,}])/cg)
+ elsif($self->{input} =~ /\G[ \t]*"((?:[^"]+|"")*)"(?=\s*[,}])/cg)
{
my $value = $1;
$value =~ s/""/"/g;
@@ -408,7 +229,7 @@ sub _parse
# so now $hash->{comments}->[0]->[0]->[0]->{source}->[0]->[0]->[0]->{src}->[0]->[0]->{tag}->[0]->{id}
# becomes $hash->{comments}->[0]->{source}->{src}->{tag}->{id}
# this may create problem as array might suddenly change to hash depending on whether it
-# has multiple elements or not. So set $flag to 2 or 0/undef would disallow trimming that
+# has multiple elements or not. So set $flag to 2 or 0/undef would disallow trimming that
# would lead to data type change, thus resulting in data structure like:
# 'comments' => ARRAY(0x88617e8)
# 0 HASH(0x889d578)
@@ -423,25 +244,6 @@ sub _parse
# 'id' => 5
# still not the safest, but saves some hassle writing code
-=head2 trimdata
-
- Parameters: $hashref or $arrayref
- $trimflag (optional, see Notes)
- Example: trimdata($datahash); # using the default flag
- Function: recursively process all attributes of a hash/array
- hybrid and get rid of any arrayref that points to
- one-element arrays (trims data structure) depending on
- the optional flag.
- Returns: none - trimming happenes in-place
- Notes: This function is useful to compact a data structure produced by
- Bio::ASN1::EntrezGene::parse.
- The acceptable values for $trimopt include:
- 1 - trim as much as possibile
- 2 (or 0, undef) - trim to an easy-to-use structure
- 3 - no trimming (in version 1.06, prior to version
- 1.06, 0 or undef means no trimming)
-
-=cut
sub trimdata
{
@@ -483,17 +285,6 @@ sub trimdata
}
}
-=head2 fh
-
- Parameters: $filehandle (optional)
- Example: trimdata($datahash); # using the default flag
- Function: getter/setter for file handle
- Returns: file handle for current file being parsed.
- Notes: Use with care!
- Line number report would not be corresponding to file's line
- number if seek operation is performed on the file handle!
-
-=cut
sub fh
{
@@ -506,6 +297,186 @@ sub fh
return $self->{fh};
}
+
+sub rawdata
+{
+    my ($self) = @_; # takes no arguments besides the parser object
+    return 'Entrezgene ::= ' . $self->{input}; # put the record header back in front of the text held in {input}
+}
+
+1;
+
+__END__
+
+=pod
+
+=encoding utf-8
+
+=head1 NAME
+
+Bio::ASN1::EntrezGene - Regular expression-based Perl Parser for NCBI Entrez Gene.
+
+=head1 VERSION
+
+version 1.70
+
+=head1 SYNOPSIS
+
+ use Bio::ASN1::EntrezGene;
+
+ my $parser = Bio::ASN1::EntrezGene->new('file' => "Homo_sapiens");
+ while(my $result = $parser->next_seq)
+ {
+ # extract data from $result, or Dumpvalue->new->dumpValue($result);
+ }
+
+ # a new way to get the $result data hash for a particular gene id:
+ use Bio::ASN1::EntrezGene::Indexer;
+ my $inx = Bio::ASN1::EntrezGene::Indexer->new(-filename => 'entrezgene.idx');
+ my $seq = $inx->fetch_hash(10); # returns $result for Entrez Gene record
+ # with geneid 10
+ # note that the index file 'entrezgene.idx' can be created as follows
+ my $inx = Bio::ASN1::EntrezGene::Indexer->new(
+ -filename => 'entrezgene.idx',
+ -write_flag => 'WRITE');
+ $inx->make_index('Homo_sapiens', 'Mus_musculus'); # files come from NCBI download
+
+ # for more detail please refer to Bio::ASN1::EntrezGene::Indexer perldoc
+
+=head1 DESCRIPTION
+
+Bio::ASN1::EntrezGene is a regular expression-based Perl Parser for NCBI Entrez
+Gene genome databases (L<http://www.ncbi.nih.gov/entrez/query.fcgi?db=gene>). It
+parses an ASN.1-formatted Entrez Gene record and returns a data structure that
+contains all data items from the gene record.
+
+The parser will report error & line number if input data does not conform to the
+NCBI Entrez Gene genome annotation file format.
+
+Note that it is possible to provide reading of all NCBI's ASN.1-formatted
+files through simple variations of the Entrez Gene parser (I need more
+investigation to be sure, but at least the sequence parser is a very simple
+variation on Entrez Gene parser and works well).
+
+It took the parser version 1.0 11 minutes to parse the human genome Entrez Gene
+file on one 2.4 GHz Intel Xeon processor. The addition of validation and error
+reporting in 1.03 and handling of new Entrez Gene format slowed the parser down
+about 40%.
+
+Since V1.07, this package also included an indexer that runs pretty fast (it
+takes 21 seconds for the indexer to index the human genome on the same
+processor). Therefore the combination of the modules would allow user to
+retrieve and parse arbitrary records.
+
+=head1 ATTRIBUTES
+
+=head2 maxerrstr
+
+ Parameters: $maxerrstr (optional) - maximum number of characters after
+ offending element, used by error reporting, default is 20
+ Example: $parser->maxerrstr(20);
+ Function: get/set maxerrstr.
+ Returns: maxerrstr.
+ Notes:
+
+=head2 input_file
+
+ Parameters: $filename for file that contains Entrez Gene record(s)
+ Example: $parser->input_file($filename);
+ Function: Takes in name of a file containing Entrez Gene records.
+ opens the file and stores file handle
+ Returns: none.
+ Notes: Attempts to open file larger than 2 GB even on Perl that
+ does not support 2 GB file (accomplished by calling
+ "cat" and piping output. On OS that does not have "cat"
+ error message will be displayed)
+
+=head1 METHODS
+
+=head2 new
+
+ Parameters: maxerrstr => 20 (optional) - maximum number of characters after
+ offending element, used by error reporting, default is 20
+ file or -file => $filename (optional) - name of the file to be
+ parsed. call next_seq to parse!
+ fh or -fh => $filehandle (optional) - handle of the file to be
+ parsed.
+ Example: my $parser = Bio::ASN1::EntrezGene->new();
+ Function: Instantiate a parser object
+ Returns: Object reference
+ Notes: Setting file or fh will reset line numbers etc. that are used
+ for error reporting purposes, and seeking on file handle would
+ mess up linenumbers!
+
+=head2 parse
+
+ Parameters: $string that contains Entrez Gene record,
+ $trimopt (optional) that specifies how the data structure
+ returned should be trimmed. 2 is recommended and
+ default
+ $noreset (optional) that specifies that line number should not
+ be reset
+ DEPRECATED as external function!!! Do not call this function
+ directly! Call next_seq() instead
+ Example: my $value = $parser->parse($text); # DEPRECATED as
+ # external function!!! Do not call this function
+ # directly! Call next_seq() instead
+ Function: Takes in a string representing Entrez Gene record, parses
+ the record and returns a data structure.
+ Returns: A data structure containing all data items from the Entrez
+ Gene record.
+ Notes: DEPRECATED as external function!!! Do not call this function
+ directly! Call next_seq() instead
+ $string should not contain 'EntrezGene ::=' at beginning!
+
+=head2 next_seq
+
+ Parameters: $trimopt (optional) that specifies how the data structure
+ returned should be trimmed. option 2 is recommended and
+ default
+ Example: my $value = $parser->next_seq();
+ Function: Uses the file handle generated by input_file, parses the next
+ record and returns a data structure.
+ Returns: A data structure containing all data items from the Entrez
+ Gene record.
+ Notes: Must pass in a filename through new() or input_file() first!
+ For details on how to use the $trimopt data trimming option
+ please see comment for the trimdata method. An option
+ of 2 is recommended and default
+ The acceptable values for $trimopt include:
+ 1 - trim as much as possible
+ 2 (or 0, undef) - trim to an easy-to-use structure
+ 3 - no trimming (in version 1.06, prior to version
+ 1.06, 0 or undef means no trimming)
+
+=head2 trimdata
+
+ Parameters: $hashref or $arrayref
+ $trimflag (optional, see Notes)
+ Example: trimdata($datahash); # using the default flag
+ Function: recursively process all attributes of a hash/array
+ hybrid and get rid of any arrayref that points to
+ one-element arrays (trims data structure) depending on
+ the optional flag.
+ Returns: none - trimming happens in-place
+ Notes: This function is useful to compact a data structure produced by
+ Bio::ASN1::EntrezGene::parse.
+ The acceptable values for $trimopt include:
+ 1 - trim as much as possible
+ 2 (or 0, undef) - trim to an easy-to-use structure
+ 3 - no trimming (in version 1.06, prior to version
+ 1.06, 0 or undef means no trimming)
+
+=head2 fh
+
+ Parameters: $filehandle (optional)
+ Example: my $fh = $parser->fh; # get the current file handle
+ Function: getter/setter for file handle
+ Returns: file handle for current file being parsed.
+ Notes: Use with care!
+ Line number report would not be corresponding to file's line
+ number if seek operation is performed on the file handle!
+
=head2 rawdata
Parameters: none
@@ -514,16 +485,89 @@ sub fh
Returns: a string containing the ASN1-formatted Entrez Gene record
Notes: Must first parse a record then call this function!
Could be useful in interpreting line number value in error
- report (if user did a seek on file handle right before parsing
+ report (if user did a seek on file handle right before parsing
call)
-=cut
+=head1 INTERNAL METHODS
-sub rawdata
-{
- my $self = shift;
- return "Entrezgene ::= $self->{input}";
-}
+=head2 _parse
-1;
+NCBI's Apr 05, 2005 format change forced heavy use of lookahead, which certainly
+slows the parser down, but the parser can't be coded efficiently without it.
+
+=head1 PREREQUISITE
+
+None.
+
+=head1 INSTALLATION
+
+Bio::ASN1::EntrezGene package can be installed & tested as follows:
+
+ perl Makefile.PL
+ make
+ make test
+ make install
+
+=head1 SEE ALSO
+
+The parse_entrez_gene_example.pl script included in this package (please
+see the Bio-ASN1-EntrezGene-x.xx/examples directory) is a very
+important and near-complete demo on using this module to extract all data
+items from Entrez Gene records. Do check it out because in fact, this
+script took me about 3-4 times more time to make for my project than the
+parser V1.0 itself. Note that the example script was edited to leave
+out stuff specific to my internal project.
+
+For details on the various parsers I generated for Entrez Gene and the example scripts that
+use/benchmark the modules, please see L<http://sourceforge.net/projects/egparser/>.
+Those other parsers etc. are included in V1.05 download.
+
+=head1 CITATION
+
+Liu, Mingyi, and Andrei Grigoriev. "Fast parsers for Entrez Gene."
+Bioinformatics 21, no. 14 (2005): 3189-3190.
+
+=head1 OPERATING SYSTEMS SUPPORTED
+
+Any OS that Perl runs on.
+
+=head1 FEEDBACK
+
+=head2 Mailing lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list. Your participation is much appreciated.
+
+ bioperl-l@bioperl.org - General discussion
+ http://bioperl.org/wiki/Mailing_lists - About the mailing lists
+
+=head2 Support
+
+Please direct usage questions or support issues to the mailing list:
+I<bioperl-l@bioperl.org>
+
+rather than to the module maintainer directly. Many experienced and
+responsive experts will be able to look at the problem and quickly
+address it. Please include a thorough description of the problem
+with code and data examples if at all possible.
+
+=head2 Reporting bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via the
+web:
+
+ https://redmine.open-bio.org/projects/bioperl/
+
+=head1 AUTHOR
+
+Dr. Mingyi Liu <mingyiliu@gmail.com>
+
+=head1 COPYRIGHT
+
+This software is copyright (c) 2005 by Mingyi Liu, 2005 by GPC Biotech AG, and 2005 by Altana Research Institute.
+
+This software is available under the same terms as the perl 5 programming language system itself.
+
+=cut
@@ -1,97 +1,41 @@
-=head1 NAME
-
-Bio::ASN1::Sequence::Indexer - Indexes NCBI Sequence files.
-
-=head1 SYNOPSIS
-
- use Bio::ASN1::Sequence::Indexer;
-
- # creating & using the index is just a few lines
- my $inx = Bio::ASN1::Sequence::Indexer->new(
- -filename => 'seq.idx',
- -write_flag => 'WRITE'); # needed for make_index call, but if opening
- # existing index file, don't set write flag!
- $inx->make_index('seq1.asn', 'seq2.asn');
- my $seq = $inx->fetch('AF093062'); # Bio::Seq obj for Sequence (doesn't work yet)
- # alternatively, if one prefers just a data structure instead of objects
- $seq = $inx->fetch_hash('AF093062'); # a hash produced by Bio::ASN1::Sequence
- # that contains all data in the Sequence record
-
-=head1 PREREQUISITE
-
-Bio::ASN1::Sequence, Bioperl and all dependencies therein.
-
-=head1 INSTALLATION
-
-Same as Bio::ASN1::EntrezGene
-
-=head1 DESCRIPTION
-
-Bio::ASN1::Sequence::Indexer is a Perl Indexer for NCBI Sequence genome
-databases. It processes an ASN.1-formatted Sequence record and stores the
-file position for each record in a way compliant with Bioperl standard (in
-fact its a subclass of Bioperl's index objects).
-
-Note that this module does not parse record, because it needs to run fast and
-grab only the gene ids. For parsing record, use Bio::ASN1::Sequence.
-
-As with Bio::ASN1::Sequence, this module is best thought of as beta version -
-it works, but is not fully tested.
-
-=head1 SEE ALSO
-
-Please check out perldoc for Bio::ASN1::EntrezGene for more info.
-
-=head1 AUTHOR
-
-Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>
-
-=head1 COPYRIGHT
-
-The Bio::ASN1::EntrezGene module and its related modules and scripts
-are copyright (c) 2005 Mingyi Liu, GPC Biotech AG and Altana Research
-Institute. All rights reserved. I created these modules when working
-on a collaboration project between these two companies. Therefore a
-special thanks for the two companies to allow the release of the code
-into public domain.
-
-You may use and distribute them under the terms of the Perl itself or
-GPL (L<http://www.gnu.org/copyleft/gpl.html>).
-
-=head1 CITATION
-
-Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
-Bioinformatics. In press
-
-=head1 OPERATION SYSTEMS SUPPORTED
-
-Any OS that Perl & Bioperl run on.
-
-=head1 METHODS
-
-=cut
-
package Bio::ASN1::Sequence::Indexer;
-
+BEGIN {
+ $Bio::ASN1::Sequence::Indexer::AUTHORITY = 'cpan:BIOPERLML';
+}
+{
+ $Bio::ASN1::Sequence::Indexer::VERSION = '1.70';
+}
+use utf8;
use strict;
+use warnings;
use Carp qw(carp croak);
-use vars qw ($VERSION @ISA);
use Bio::ASN1::Sequence;
use Bio::Index::AbstractSeq;
+use parent qw(Bio::Index::AbstractSeq);
+
+# ABSTRACT: Indexes NCBI Sequence files.
+# AUTHOR: Dr. Mingyi Liu <mingyiliu@gmail.com>
+# OWNER: 2005 Mingyi Liu
+# OWNER: 2005 GPC Biotech AG
+# OWNER: 2005 Altana Research Institute
+# LICENSE: Perl_5
+
-@ISA = qw(Bio::Index::AbstractSeq);
-$VERSION = '1.09';
+
+# TODO: Should this be deprecated?
sub _version
{
- return $VERSION;
+ return $Bio::Index::AbstractSeq::VERSION;
}
+
sub _type_stamp
{
return '__Sequence_ASN1__';
}
+
sub _index_file
{
my($self, $file, $idx) = @_;
@@ -111,11 +55,87 @@ sub _index_file
return 1;
}
+
sub _file_format
{
return 'sequence';
}
+
+
+# Fetch one record by id and return it as a parsed data structure.
+#   $seqid - key to look up in the index database ($self->db)
+# Returns whatever Bio::ASN1::Sequence->next_seq yields after seeking to the
+# indexed position. There is no explicit return when $seqid is not indexed.
+sub fetch_hash
+{
+ my ($self, $seqid) = @_;
+ if (my $seq = $self->db->{$seqid})
+ {
+ # the index entry unpacks to (file number, offset handed to seek below)
+ my ($fileno, $position) = $self->unpack_record($seq);
+ my $parser = Bio::ASN1::Sequence->new('fh' => $self->_file_handle($fileno));
+ seek($parser->fh, $position, 0); # whence 0: offset counted from start of file
+ return $parser->next_seq;
+ }
+}
+
+
+# Return a read handle for indexed file number $i, opening it on first use.
+#   $i - number of the file; its name is stored under the "__FILE_$i" key
+# Handles are cached in $self->{'_filehandle'} so each file is opened once.
+# Throws (via $self->throw) if the filename record is missing or the open fails.
+sub _file_handle {
+ my( $self, $i ) = @_;
+
+ unless ($self->{'_filehandle'}[$i]) {
+ my @rec = $self->unpack_record($self->db->{"__FILE_$i"})
+ or $self->throw("Can't get filename for index : $i");
+ my $file = $rec[0];
+ local *FH;
+ # three-argument open: the name is always taken literally and opened
+ # read-only, so characters such as '>' or '|' in it cannot change the mode
+ open *FH, '<', $file or $self->throw("Can't read file '$file' : $!");
+ $self->{'_filehandle'}[$i] = *FH; # Cache filehandle
+ }
+ return $self->{'_filehandle'}[$i];
+}
+
+1;
+
+__END__
+
+=pod
+
+=encoding utf-8
+
+=head1 NAME
+
+Bio::ASN1::Sequence::Indexer - Indexes NCBI Sequence files.
+
+=head1 VERSION
+
+version 1.70
+
+=head1 SYNOPSIS
+
+ use Bio::ASN1::Sequence::Indexer;
+
+ # creating & using the index is just a few lines
+ my $inx = Bio::ASN1::Sequence::Indexer->new(
+ -filename => 'seq.idx',
+ -write_flag => 'WRITE'); # needed for make_index call, but if opening
+ # existing index file, don't set write flag!
+ $inx->make_index('seq1.asn', 'seq2.asn');
+ my $seq = $inx->fetch('AF093062'); # Bio::Seq obj for Sequence (doesn't work yet)
+ # alternatively, if one prefers just a data structure instead of objects
+ $seq = $inx->fetch_hash('AF093062'); # a hash produced by Bio::ASN1::Sequence
+ # that contains all data in the Sequence record
+
+=head1 DESCRIPTION
+
+Bio::ASN1::Sequence::Indexer is a Perl Indexer for NCBI Sequence genome
+databases. It processes an ASN.1-formatted Sequence record and stores the
+file position for each record in a way compliant with Bioperl standard (in
+fact it's a subclass of Bioperl's index objects).
+
+Note that this module does not parse records, because it needs to run fast and
+grab only the gene ids. To parse records, use Bio::ASN1::Sequence.
+
+As with Bio::ASN1::Sequence, this module is best thought of as beta version -
+it works, but is not fully tested.
+
+=head1 METHODS
+
=head2 fetch
Parameters: $geneid - id for the Sequence record to be retrieved
@@ -125,8 +145,6 @@ sub _file_format
Notes: Bio::SeqIO::sequence does not exist and probably won't
exist for a while! So call fetch_hash instead
-=cut
-
=head2 fetch_hash
Parameters: $seqid - id for the Sequence record to be retrieved
@@ -136,19 +154,15 @@ sub _file_format
record.
Notes: Alternative to fetch()
-=cut
+=head1 INTERNAL METHODS
-sub fetch_hash
-{
- my ($self, $seqid) = @_;
- if (my $seq = $self->db->{$seqid})
- {
- my ($fileno, $position) = $self->unpack_record($seq);
- my $parser = Bio::ASN1::Sequence->new('fh' => $self->_file_handle($fileno));
- seek($parser->fh, $position, 0);
- return $parser->next_seq;
- }
-}
+=head2 _version
+
+=head2 _type_stamp
+
+=head2 _index_file
+
+=head2 _file_format
=head2 _file_handle
@@ -166,21 +180,64 @@ sub fetch_hash
changes file handle code like I do below to fit perl 5.005_03, this
sub would be removed from this module
-=cut
+=head1 PREREQUISITE
-sub _file_handle {
- my( $self, $i ) = @_;
-
- unless ($self->{'_filehandle'}[$i]) {
- my @rec = $self->unpack_record($self->db->{"__FILE_$i"})
- or $self->throw("Can't get filename for index : $i");
- my $file = $rec[0];
- local *FH;
- open *FH, $file or $self->throw("Can't read file '$file' : $!");
- $self->{'_filehandle'}[$i] = *FH; # Cache filehandle
- }
- return $self->{'_filehandle'}[$i];
-}
+Bio::ASN1::Sequence, Bioperl and all dependencies therein.
-1;
+=head1 INSTALLATION
+
+Same as Bio::ASN1::EntrezGene
+
+=head1 SEE ALSO
+
+Please check out perldoc for Bio::ASN1::EntrezGene for more info.
+
+=head1 CITATION
+
+Liu, Mingyi, and Andrei Grigoriev. "Fast parsers for Entrez Gene."
+Bioinformatics 21, no. 14 (2005): 3189-3190.
+
+=head1 OPERATING SYSTEMS SUPPORTED
+
+Any OS that Perl & Bioperl run on.
+
+=head1 FEEDBACK
+
+=head2 Mailing lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list. Your participation is much appreciated.
+
+ bioperl-l@bioperl.org - General discussion
+ http://bioperl.org/wiki/Mailing_lists - About the mailing lists
+
+=head2 Support
+
+Please direct usage questions or support issues to the mailing list:
+I<bioperl-l@bioperl.org>
+
+rather than to the module maintainer directly. Many experienced and
+responsive experts will be able to look at the problem and quickly
+address it. Please include a thorough description of the problem
+with code and data examples if at all possible.
+
+=head2 Reporting bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via the
+web:
+
+ https://redmine.open-bio.org/projects/bioperl/
+
+=head1 AUTHOR
+
+Dr. Mingyi Liu <mingyiliu@gmail.com>
+
+=head1 COPYRIGHT
+
+This software is copyright (c) 2005 by Mingyi Liu, 2005 by GPC Biotech AG, and 2005 by Altana Research Institute.
+
+This software is available under the same terms as the perl 5 programming language system itself.
+
+=cut
@@ -1,121 +1,23 @@
-=head1 NAME
-
-Bio::ASN1::Sequence - Regular expression-based Perl Parser for ASN.1-formatted NCBI Sequences.
-
-=head1 SYNOPSIS
-
- use Bio::ASN1::Sequence;
-
- my $parser = Bio::ASN1::Sequence->new('file' => "downloaded.asn1");
- while(my $result = $parser->next_seq)
- {
- # extract data from $result, or Dumpvalue->new->dumpValue($result);
- }
-
- # a new way to get the $result data hash for a particular sequence id:
- use Bio::ASN1::Sequence::Indexer;
- my $inx = Bio::ASN1::Sequence::Indexer->new(-filename => 'seq.idx');
- my $seq = $inx->fetch_hash('AF093062');
-
- # for creation of .idx index files please refer to
- # Bio::ASN1::Sequence::Indexer perldoc
-
-=head1 PREREQUISITE
-
-None.
-
-=head1 INSTALLATION
-
-Bio::ASN1::Sequence is part of the Bio::ASN1::EntrezGene package.
-Bio::ASN1::EntrezGene package can be installed & tested as follows:
-
- perl Makefile.PL
- make
- make test
- make install
-
-=head1 DESCRIPTION
-
-Bio::ASN1::Sequence is a regular expression-based Perl Parser for ASN.1-formatted
-NCBI sequences. It parses an ASN.1-formatted sequence record and returns a data
-structure that contains all data items from the sequence record.
-
-The parser will report error & line number if input data does not conform to the
-NCBI Sequence annotation file format.
-
-The sequence parser is basically a modified version of the high-performance
-Bio::ASN1::EntrezGene parser. However, I created a standalone module for sequence
-since it is more efficient to keep Sequence-specific code out of EntrezGene.pm.
-
-In fact it is possible to provide reading of all NCBI's ASN.1-formatted
-files through simple variations of the Entrez Gene parser (I need more
-investigation to be sure, but at least the sequence parser works well).
-
-Since demand for parsing NCBI ASN.1-formatted sequences is much lower than EntrezGene,
-this module is more like a beta version that works on the examples I checked, but
-I did not check all available records or data definitions. The error-reporting
-function of this module has to be useful sometimes. :)
-
-=head1 SEE ALSO
-
-The parse_sequence_example.pl script included in this package (please
-see the Bio-ASN1-EntrezGene-x.xx/examples directory) shows the usage.
-
-Please check out perldoc for Bio::ASN1::EntrezGene for more info.
-
-=head1 AUTHOR
-
-Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>
-
-=head1 COPYRIGHT
-
-The Bio::ASN1::EntrezGene module and its related modules and scripts
-are copyright (c) 2005 Mingyi Liu, GPC Biotech AG and Altana Research
-Institute. All rights reserved. I created these modules when working
-on a collaboration project between these two companies. Therefore a
-special thanks for the two companies to allow the release of the code
-into public domain.
-
-You may use and distribute them under the terms of the Perl itself or
-GPL (L<http://www.gnu.org/copyleft/gpl.html>).
-
-=head1 CITATION
-
-Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
-Bioinformatics. In press
-
-=head1 OPERATION SYSTEMS SUPPORTED
-
-Any OS that Perl runs on.
-
-=head1 METHODS
-
-=cut
-
package Bio::ASN1::Sequence;
-
+BEGIN {
+ $Bio::ASN1::Sequence::AUTHORITY = 'cpan:BIOPERLML';
+}
+{
+ $Bio::ASN1::Sequence::VERSION = '1.70';
+}
+use utf8;
use strict;
+use warnings;
use Carp qw(carp croak);
-use vars qw ($VERSION);
-$VERSION = '1.09';
+# ABSTRACT: Regular expression-based Perl Parser for ASN.1-formatted NCBI Sequences.
+# AUTHOR: Dr. Mingyi Liu <mingyiliu@gmail.com>
+# OWNER: 2005 Mingyi Liu
+# OWNER: 2005 GPC Biotech AG
+# OWNER: 2005 Altana Research Institute
+# LICENSE: Perl_5
-=head2 new
- Parameters: maxerrstr => 20 (optional) - maximum number of characters after
- offending element, used by error reporting, default is 20
- file or -file => $filename (optional) - name of the file to be
- parsed. call next_seq to parse!
- fh or -fh => $filehandle (optional) - handle of the file to be
- parsed.
- Example: my $parser = Bio::ASN1::Sequence->new();
- Function: Instantiate a parser object
- Returns: Object reference
- Notes: Setting file or fh will reset line numbers etc. that are used
- for error reporting purposes, and seeking on file handle would
- mess up linenumbers!
-
-=cut
sub new
{
@@ -128,16 +30,6 @@ sub new
return $self;
}
-=head2 maxerrstr
-
- Parameters: $maxerrstr (optional) - maximum number of characters after
- offending element, used by error reporting, default is 20
- Example: $parser->maxerrstr(20);
- Function: get/set maxerrstr.
- Returns: maxerrstr.
- Notes:
-
-=cut
sub maxerrstr
{
@@ -147,28 +39,6 @@ sub maxerrstr
}
-=head2 parse
-
- Parameters: $string that contains Sequence record,
- $trimopt (optional) that specifies how the data structure
- returned should be trimmed. 2 is recommended and
- default
- $noreset (optional) that species that line number should not
- be reset
- DEPRECATED as external function!!! Do not call this function
- directly! Call next_seq() instead
- Example: my $value = $parser->parse($text); # DEPRECATED as
- # external function!!! Do not call this function
- # directly! Call next_seq() instead
- Function: Takes in a string representing Sequence record, parses
- the record and returns a data structure.
- Returns: A data structure containing all data items from the sequence
- record.
- Notes: DEPRECATED as external function!!! Do not call this function
- directly! Call next_seq() instead
- $string should not contain 'Seq-entry ::= set' at beginning!
-
-=cut
sub parse
{
@@ -198,19 +68,6 @@ sub parse
return $result;
}
-=head2 input_file
-
- Parameters: $filename for file that contains Sequence record(s)
- Example: $parser->input_file($filename);
- Function: Takes in name of a file containing Sequence records.
- opens the file and stores file handle
- Returns: none.
- Notes: Attemps to open file larger than 2 GB even on Perl that
- does not support 2 GB file (accomplished by calling
- "cat" and piping output. On OS that does not have "cat"
- error message will be displayed)
-
-=cut
sub input_file
{
@@ -225,27 +82,6 @@ sub input_file
$self->{linenumber} = 0; # reset line number
}
-=head2 next_seq
-
- Parameters: $trimopt (optional) that specifies how the data structure
- returned should be trimmed. option 2 is recommended and
- default
- Example: my $value = $parser->next_seq();
- Function: Use the file handle generated by input_file, parses the next
- the record and returns a data structure.
- Returns: A data structure containing all data items from the sequence
- record.
- Notes: Must pass in a filename through new() or input_file() first!
- For details on how to use the $trimopt data trimming option
- please see comment for the trimdata method. An option
- of 2 is recommended and default
- The acceptable values for $trimopt include:
- 1 - trim as much as possibile
- 2 (or 0, undef) - trim to an easy-to-use structure
- 3 - no trimming (in version 1.06, prior to version
- 1.06, 0 or undef means no trimming)
-
-=cut
sub next_seq
{
@@ -261,8 +97,7 @@ sub next_seq
}
}
-# NCBI's Apr 05, 2005 format change forced much usage of lookahead, which would for
-# sure slows parser down. But can't code efficiently without it.
+
sub _parse
{
my ($self, $flag) = @_;
@@ -296,7 +131,7 @@ sub _parse
$self->{linenumber} += $lines =~ s/\n//g || $lines =~ s/\r//g; # count by *NIX/Win or Mac
my ($tmp, $tmp1);
# we put \s* in lookahead for linenumber counting purpose (which slows things down)
- if(($self->{input} =~ /\G"((?:[^"]|"")*)"(?=\s*[,}])/cg && ++$tmp) ||
+ if(($self->{input} =~ /\G"((?:[^"]+|"")*)"(?=\s*[,}])/cg && ++$tmp) ||
($self->{input} =~ /\G'([^']+)'\s*H/icg && ++$tmp1) || # this is the only difference b/w sequence and entrez gene formats so far
$self->{input} =~ /\G([\w-]+)(?=\s*[,}])/cg)
{
@@ -337,7 +172,7 @@ sub _parse
return $data;
}
}
- elsif($self->{input} =~ /\G[ \t]*"((?:[^"]|"")*)"(?=\s*[,}])/cg)
+ elsif($self->{input} =~ /\G[ \t]*"((?:[^"]+|"")*)"(?=\s*[,}])/cg)
{
my $value = $1;
$value =~ s/""/"/g;
@@ -401,7 +236,7 @@ sub _parse
# so now $hash->{comments}->[0]->[0]->[0]->{source}->[0]->[0]->[0]->{src}->[0]->[0]->{tag}->[0]->{id}
# becomes $hash->{comments}->[0]->{source}->{src}->{tag}->{id}
# this may create problem as array might suddenly change to hash depending on whether it
-# has multiple elements or not. So set $flag to 2 or 0/undef would disallow trimming that
+# has multiple elements or not. So set $flag to 2 or 0/undef would disallow trimming that
# would lead to data type change, thus resulting in data structure like:
# 'comments' => ARRAY(0x88617e8)
# 0 HASH(0x889d578)
@@ -416,27 +251,6 @@ sub _parse
# 'id' => 5
# still not the safest, but saves some hassle writing code
-=head2 trimdata
-
- Parameters: $hashref or $arrayref
- $trimflag (optional, see Notes)
- Example: trimdata($datahash); # using the default flag
- Function: recursively process all attributes of a hash/array
- hybrid and get rid of any arrayref that points to
- one-element arrays (trims data structure) depending on
- the optional flag.
- Returns: none - trimming happenes in-place
- Notes: This function is useful to compact a data structure produced by
- Bio::ASN1::Sequence::parse.
- The acceptable values for $trimopt include:
- 1 - trim as much as possibile
- 2 (or 0, undef) - trim to an easy-to-use structure
- 3 - no trimming (in version 1.06, prior to version
- 1.06, 0 or undef means no trimming)
- This function is duplicate to EntrezGene.pm's and code should
- be compressed in the future (using util module & subclass).
-
-=cut
sub trimdata
{
@@ -478,17 +292,6 @@ sub trimdata
}
}
-=head2 fh
-
- Parameters: $filehandle (optional)
- Example: trimdata($datahash); # using the default flag
- Function: getter/setter for file handle
- Returns: file handle for current file being parsed.
- Notes: Use with care!
- Line number report would not be corresponding to file's line
- number if seek operation is performed on the file handle!
-
-=cut
sub fh
{
@@ -501,6 +304,180 @@ sub fh
return $self->{fh};
}
+
+# Return the text currently held in $self->{input}, prefixed with the
+# "Seq-entry ::= set " header.
+sub rawdata
+{
+ my $self = shift;
+ return "Seq-entry ::= set $self->{input}";
+}
+
+1;
+
+__END__
+
+=pod
+
+=encoding utf-8
+
+=head1 NAME
+
+Bio::ASN1::Sequence - Regular expression-based Perl Parser for ASN.1-formatted NCBI Sequences.
+
+=head1 VERSION
+
+version 1.70
+
+=head1 SYNOPSIS
+
+ use Bio::ASN1::Sequence;
+
+ my $parser = Bio::ASN1::Sequence->new('file' => "downloaded.asn1");
+ while(my $result = $parser->next_seq)
+ {
+ # extract data from $result, or Dumpvalue->new->dumpValue($result);
+ }
+
+ # a new way to get the $result data hash for a particular sequence id:
+ use Bio::ASN1::Sequence::Indexer;
+ my $inx = Bio::ASN1::Sequence::Indexer->new(-filename => 'seq.idx');
+ my $seq = $inx->fetch_hash('AF093062');
+
+ # for creation of .idx index files please refer to
+ # Bio::ASN1::Sequence::Indexer perldoc
+
+=head1 DESCRIPTION
+
+Bio::ASN1::Sequence is a regular expression-based Perl Parser for ASN.1-formatted
+NCBI sequences. It parses an ASN.1-formatted sequence record and returns a data
+structure that contains all data items from the sequence record.
+
+The parser will report error & line number if input data does not conform to the
+NCBI Sequence annotation file format.
+
+The sequence parser is basically a modified version of the high-performance
+Bio::ASN1::EntrezGene parser. However, I created a standalone module for sequence
+since it is more efficient to keep Sequence-specific code out of EntrezGene.pm.
+
+In fact it is possible to provide reading of all NCBI's ASN.1-formatted
+files through simple variations of the Entrez Gene parser (I need more
+investigation to be sure, but at least the sequence parser works well).
+
+Since demand for parsing NCBI ASN.1-formatted sequences is much lower than EntrezGene,
+this module is more like a beta version that works on the examples I checked, but
+I did not check all available records or data definitions. The error-reporting
+function of this module has to be useful sometimes. :)
+
+=head1 ATTRIBUTES
+
+=head2 maxerrstr
+
+ Parameters: $maxerrstr (optional) - maximum number of characters after
+ offending element, used by error reporting, default is 20
+ Example: $parser->maxerrstr(20);
+ Function: get/set maxerrstr.
+ Returns: maxerrstr.
+ Notes:
+
+=head2 input_file
+
+ Parameters: $filename for file that contains Sequence record(s)
+ Example: $parser->input_file($filename);
+ Function: Takes in name of a file containing Sequence records.
+ opens the file and stores file handle
+ Returns: none.
+ Notes: Attempts to open file larger than 2 GB even on Perl that
+ does not support 2 GB file (accomplished by calling
+ "cat" and piping output. On OS that does not have "cat"
+ error message will be displayed)
+
+=head1 METHODS
+
+=head2 new
+
+ Parameters: maxerrstr => 20 (optional) - maximum number of characters after
+ offending element, used by error reporting, default is 20
+ file or -file => $filename (optional) - name of the file to be
+ parsed. call next_seq to parse!
+ fh or -fh => $filehandle (optional) - handle of the file to be
+ parsed.
+ Example: my $parser = Bio::ASN1::Sequence->new();
+ Function: Instantiate a parser object
+ Returns: Object reference
+ Notes: Setting file or fh will reset line numbers etc. that are used
+ for error reporting purposes, and seeking on file handle would
+ mess up linenumbers!
+
+=head2 parse
+
+ Parameters: $string that contains Sequence record,
+ $trimopt (optional) that specifies how the data structure
+ returned should be trimmed. 2 is recommended and
+ default
+ $noreset (optional) that specifies that line number should not
+ be reset
+ DEPRECATED as external function!!! Do not call this function
+ directly! Call next_seq() instead
+ Example: my $value = $parser->parse($text); # DEPRECATED as
+ # external function!!! Do not call this function
+ # directly! Call next_seq() instead
+ Function: Takes in a string representing Sequence record, parses
+ the record and returns a data structure.
+ Returns: A data structure containing all data items from the sequence
+ record.
+ Notes: DEPRECATED as external function!!! Do not call this function
+ directly! Call next_seq() instead
+ $string should not contain 'Seq-entry ::= set' at beginning!
+
+=head2 next_seq
+
+ Parameters: $trimopt (optional) that specifies how the data structure
+ returned should be trimmed. option 2 is recommended and
+ default
+ Example: my $value = $parser->next_seq();
+ Function: Use the file handle generated by input_file, parses the next
+ record and returns a data structure.
+ Returns: A data structure containing all data items from the sequence
+ record.
+ Notes: Must pass in a filename through new() or input_file() first!
+ For details on how to use the $trimopt data trimming option
+ please see comment for the trimdata method. An option
+ of 2 is recommended and default
+ The acceptable values for $trimopt include:
+ 1 - trim as much as possible
+ 2 (or 0, undef) - trim to an easy-to-use structure
+ 3 - no trimming (as of version 1.06; prior to version
+ 1.06, 0 or undef meant no trimming)
+
+=head2 trimdata
+
+ Parameters: $hashref or $arrayref
+ $trimflag (optional, see Notes)
+ Example: trimdata($datahash); # using the default flag
+ Function: recursively process all attributes of a hash/array
+ hybrid and get rid of any arrayref that points to
+ one-element arrays (trims data structure) depending on
+ the optional flag.
+ Returns: none - trimming happens in-place
+ Notes: This function is useful to compact a data structure produced by
+ Bio::ASN1::Sequence::parse.
+ The acceptable values for $trimopt include:
+ 1 - trim as much as possible
+ 2 (or 0, undef) - trim to an easy-to-use structure
+ 3 - no trimming (as of version 1.06; prior to version
+ 1.06, 0 or undef meant no trimming)
+ This function is duplicate to EntrezGene.pm's and code should
+ be compressed in the future (using util module & subclass).
+
+=head2 fh
+
+ Parameters: $filehandle (optional)
+ Example: my $fh = $parser->fh(); # get the current file handle
+ Function: getter/setter for file handle
+ Returns: file handle for current file being parsed.
+ Notes: Use with care!
+ Reported line numbers will not correspond to the file's line
+ numbers if a seek operation is performed on the file handle!
+
=head2 rawdata
Parameters: none
@@ -509,16 +486,83 @@ sub fh
Returns: a string containing the ASN1-formatted sequence record
Notes: Must first parse a record then call this function!
Could be useful in interpreting line number value in error
- report (if user did a seek on file handle right before parsing
+ report (if user did a seek on file handle right before parsing
call)
-=cut
+=head1 INTERNAL METHODS
-sub rawdata
-{
- my $self = shift;
- return "Seq-entry ::= set $self->{input}";
-}
+=head2 _parse
-1;
+NCBI's Apr 05, 2005 format change forced heavy use of lookahead, which certainly
+slows the parser down, but the parser can't be coded efficiently without it.
+
+=head1 PREREQUISITE
+
+None.
+
+=head1 INSTALLATION
+
+Bio::ASN1::Sequence is part of the Bio::ASN1::EntrezGene package.
+Bio::ASN1::EntrezGene package can be installed & tested as follows:
+
+ perl Makefile.PL
+ make
+ make test
+ make install
+
+=head1 SEE ALSO
+
+The parse_sequence_example.pl script included in this package (please
+see the Bio-ASN1-EntrezGene-x.xx/examples directory) shows the usage.
+
+Please check out perldoc for Bio::ASN1::EntrezGene for more info.
+
+=head1 CITATION
+
+Liu, Mingyi, and Andrei Grigoriev. "Fast parsers for Entrez Gene."
+Bioinformatics 21, no. 14 (2005): 3189-3190.
+
+=head1 OPERATING SYSTEMS SUPPORTED
+
+Any OS that Perl runs on.
+
+=head1 FEEDBACK
+
+=head2 Mailing lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list. Your participation is much appreciated.
+
+ bioperl-l@bioperl.org - General discussion
+ http://bioperl.org/wiki/Mailing_lists - About the mailing lists
+
+=head2 Support
+
+Please direct usage questions or support issues to the mailing list:
+I<bioperl-l@bioperl.org>
+
+rather than to the module maintainer directly. Many experienced and
+responsive experts will be able to look at the problem and quickly
+address it. Please include a thorough description of the problem
+with code and data examples if at all possible.
+
+=head2 Reporting bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via the
+web:
+
+ https://redmine.open-bio.org/projects/bioperl/
+
+=head1 AUTHOR
+
+Dr. Mingyi Liu <mingyiliu@gmail.com>
+
+=head1 COPYRIGHT
+
+This software is copyright (c) 2005 by Mingyi Liu, 2005 by GPC Biotech AG, and 2005 by Altana Research Institute.
+
+This software is available under the same terms as the perl 5 programming language system itself.
+
+=cut
@@ -0,0 +1,47 @@
+use strict;
+use warnings;
+
+# this test was generated with Dist::Zilla::Plugin::Test::Compile 2.027
+
+# one test per module file, plus the warnings check when AUTHOR_TESTING is set
+use Test::More tests => 4 + ($ENV{AUTHOR_TESTING} ? 1 : 0);
+
+
+
+my @module_files = (
+ 'Bio/ASN1/EntrezGene.pm',
+ 'Bio/ASN1/EntrezGene/Indexer.pm',
+ 'Bio/ASN1/Sequence.pm',
+ 'Bio/ASN1/Sequence/Indexer.pm'
+);
+
+
+
+# no fake home requested
+
+use IPC::Open3;
+use IO::Handle;
+
+# everything the child processes print on stderr is collected here
+my @warnings;
+for my $lib (@module_files)
+{
+ # see L<perlfaq8/How can I capture STDERR from an external command?>
+ my $stdin = ''; # converted to a gensym by open3
+ my $stderr = IO::Handle->new;
+ binmode $stderr, ':crlf' if $^O eq 'MSWin32';
+
+ # require the module in a fresh perl ($^X) run with -Mblib; the child's
+ # stdout is sent to our STDERR and its stderr is read through $stderr
+ my $pid = open3($stdin, '>&STDERR', $stderr, qq{$^X -Mblib -e"require q[$lib]"});
+ waitpid($pid, 0);
+ is($? >> 8, 0, "$lib loaded ok"); # $? >> 8 is the child's exit status
+
+ # re-emit whatever the child wrote to stderr and remember it for the final check
+ if (my @_warnings = <$stderr>)
+ {
+ warn @_warnings;
+ push @warnings, @_warnings;
+ }
+}
+
+
+
+# collected stderr output is only treated as a failure under AUTHOR_TESTING
+is(scalar(@warnings), 0, 'no warnings found') if $ENV{AUTHOR_TESTING};
+
+
diff --git a/var/tmp/source/MINGYILIU/Bio-ASN1-EntrezGene-1.091/Bio-ASN1-EntrezGene-1.09/t/input.asn b/var/tmp/source/CJFIELDS/Bio-ASN1-EntrezGene-1.70/Bio-ASN1-EntrezGene-1.70/t/input.asn
old mode 100644
new mode 100755
diff --git a/var/tmp/source/MINGYILIU/Bio-ASN1-EntrezGene-1.091/Bio-ASN1-EntrezGene-1.09/t/input1.asn b/var/tmp/source/CJFIELDS/Bio-ASN1-EntrezGene-1.70/Bio-ASN1-EntrezGene-1.70/t/input1.asn
old mode 100644
new mode 100755
@@ -0,0 +1,16 @@
+
+# Release-only test: skip the whole file unless RELEASE_TESTING is set.
+BEGIN {
+ unless ($ENV{RELEASE_TESTING}) {
+ require Test::More;
+ Test::More::plan(skip_all => 'these tests are for release candidate testing');
+ }
+}
+
+use strict;
+use warnings;
+use Test::More;
+
+# Test::EOL is optional: skip rather than fail when it cannot be loaded
+eval 'use Test::EOL';
+plan skip_all => 'Test::EOL required' if $@;
+
+# run the Test::EOL check with the trailing_whitespace option switched on
+all_perl_files_ok({ trailing_whitespace => 1 });
@@ -0,0 +1,20 @@
+#!perl
+# Release-only check of source file encodings via Test::Mojibake.
+BEGIN {
+ unless ($ENV{RELEASE_TESTING}) {
+ require Test::More;
+ Test::More::plan(skip_all => 'these tests are for release candidate testing');
+ }
+}
+
+
+use strict;
+use warnings qw(all);
+
+use Test::More;
+# Test::Mojibake is optional: it is loaded at run time and the file is skipped if the load fails.
+## no critic (ProhibitStringyEval, RequireCheckingReturnValueOfEval)
+eval q(use Test::Mojibake);
+plan skip_all => q(Test::Mojibake required for source encoding testing) if $@;
+
+all_files_encoding_ok();
@@ -0,0 +1,16 @@
+# Release-only check, via Test::NoTabs, that Perl files contain no tab characters.
+BEGIN {
+ unless ($ENV{RELEASE_TESTING}) {
+ require Test::More;
+ Test::More::plan(skip_all => 'these tests are for release candidate testing');
+ }
+}
+
+use strict;
+use warnings;
+use Test::More;
+# Test::NoTabs is optional: it is loaded at run time and the file is skipped if the load fails.
+eval 'use Test::NoTabs';
+plan skip_all => 'Test::NoTabs required' if $@;
+
+all_perl_files_ok();
@@ -0,0 +1,21 @@
+#!perl
+# Release-only POD coverage check using Test::Pod::Coverage with Pod::Coverage::TrustPod.
+BEGIN {
+ unless ($ENV{RELEASE_TESTING}) {
+ require Test::More;
+ Test::More::plan(skip_all => 'these tests are for release candidate testing');
+ }
+}
+
+
+use Test::More;
+# Both helper modules are optional; a failed load skips the file instead of failing it.
+eval "use Test::Pod::Coverage 1.08";
+plan skip_all => "Test::Pod::Coverage 1.08 required for testing POD coverage"
+ if $@;
+
+eval "use Pod::Coverage::TrustPod";
+plan skip_all => "Pod::Coverage::TrustPod required for testing POD coverage"
+ if $@;
+
+all_pod_coverage_ok({ coverage_class => 'Pod::Coverage::TrustPod' });
@@ -0,0 +1,15 @@
+#!perl
+# Release-only POD syntax check via Test::Pod.
+BEGIN {
+ unless ($ENV{RELEASE_TESTING}) {
+ require Test::More;
+ Test::More::plan(skip_all => 'these tests are for release candidate testing');
+ }
+}
+
+use Test::More;
+# Test::Pod 1.41 is optional: it is loaded at run time and the file is skipped if the load fails.
+eval "use Test::Pod 1.41";
+plan skip_all => "Test::Pod 1.41 required for testing POD" if $@;
+
+all_pod_files_ok();
diff --git a/var/tmp/source/MINGYILIU/Bio-ASN1-EntrezGene-1.091/Bio-ASN1-EntrezGene-1.09/t/seq.asn b/var/tmp/source/CJFIELDS/Bio-ASN1-EntrezGene-1.70/Bio-ASN1-EntrezGene-1.70/t/seq.asn
old mode 100644
new mode 100755
@@ -1,77 +1,98 @@
#!/usr/bin/env perl -w
use strict;
-use Test::More tests => 11;
+use File::Spec;
+use Test::More tests => 6;
-my ($noindex, $noabseq, $nogene, $noseq, $noseqindex);
+# Try to load the named module; returns 1 on success and nothing on failure.
+sub check_dependency {
+    my ($module) = @_;
+
+    # String eval: require only maps Foo::Bar to Foo/Bar.pm for a bareword name.
+    eval "require $module; 1";
+    return $@ ? () : 1;
+}
+
+my ( $noindex, $noabseq, $nogene, $noseq, $noseqindex );
BEGIN {
- diag("\n\nTest indexers (Bio::ASN1::EntrezGene::Indexer, Bio::ASN1::Sequence::Indexer)\nIndexing and retrieval:\n");
- use_ok('Bio::ASN1::EntrezGene') || $nogene++;
- use_ok('Bio::Index::AbstractSeq') || $noabseq++;
- use_ok('Bio::ASN1::EntrezGene::Indexer') || $noindex++;
- use_ok('Bio::ASN1::Sequence') || $noseq++;
- use_ok('Bio::ASN1::Sequence::Indexer') || $noseqindex++;
+ diag(
+"\n\nTest indexers (Bio::ASN1::EntrezGene::Indexer, Bio::ASN1::Sequence::Indexer)\nIndexing and retrieval:\n"
+ );
+ check_dependency('Bio::ASN1::EntrezGene') || $nogene++;
+ check_dependency('Bio::Index::AbstractSeq') || $noabseq++;
+ check_dependency('Bio::ASN1::EntrezGene::Indexer') || $noindex++;
+ check_dependency('Bio::ASN1::Sequence') || $noseq++;
+ check_dependency('Bio::ASN1::Sequence::Indexer') || $noseqindex++;
}
diag("\n\nFirst testing gene indexer:\n");
-if(!$nogene)
-{
- # test indexer
- if(!$noabseq)
- {
- if(!$noindex)
- {
- my $inx = Bio::ASN1::EntrezGene::Indexer->new(-filename => 't/testgene.idx',
- -write_flag => 'WRITE');
- isa_ok($inx, 'Bio::ASN1::EntrezGene::Indexer');
- $inx->make_index('t/input.asn', 't/input1.asn');
+SKIP: {
+ if ( !$nogene ) {
+ skip( "BioPerl not installed, skipping", 3 ) if $noabseq;
+
+ # test indexer
+ if ( !$noabseq ) {
+ if ( !$noindex ) {
+ my $inx = Bio::ASN1::EntrezGene::Indexer->new(
+ -filename => File::Spec->catfile('t','testgene.idx'),
+ -write_flag => 'WRITE'
+ );
+ isa_ok( $inx, 'Bio::ASN1::EntrezGene::Indexer' );
+ $inx->make_index( File::Spec->catfile('t','input.asn'), File::Spec->catfile('t','input1.asn' ));
+
# cmp_ok($inx->count_records, '==', 4, 'total number of indexed gene records');
- my $value = $inx->fetch_hash(3);
- isa_ok($value, 'ARRAY');
- cmp_ok($value->[0]{'track-info'}[0]{geneid}, '==', 3, 'correct gene record retrieved');
+ my $value = $inx->fetch_hash(3);
+ isa_ok( $value, 'ARRAY' );
+ cmp_ok( $value->[0]{'track-info'}[0]{geneid},
+ '==', 3, 'correct gene record retrieved' );
+ }
+ else {
+ diag(
+"\nThere's some problem with the installation of Bio::ASN1::EntrezGene::Indexer!\nTry install again using:\n\tperl Makefile.PL\n\tmake\nQuitting now"
+ );
+ }
+ }
}
- else
- {
- diag("\nThere's some problem with the installation of Bio::ASN1::EntrezGene::Indexer!\nTry install again using:\n\tperl Makefile.PL\n\tmake\nQuitting now");
+ else {
+ diag(
+"\nThere's some problem with the installation of Bio::ASN1::EntrezGene!\nTry install again using:\n\tperl Makefile.PL\n\tmake\nQuitting now"
+ );
}
- }
- else
- {
- diag("\nYou need to have Bio::Index::AbstractSeq (bioperl.org)\ninstalled for testing the indexer!\nQuitting now");
- }
-}
-else
-{
- diag("\nThere's some problem with the installation of Bio::ASN1::EntrezGene!\nTry install again using:\n\tperl Makefile.PL\n\tmake\nQuitting now");
+ diag("\n\nNow testing sequence indexer:\n");
}
-diag("\n\nNow testing sequence indexer:\n");
-if(!$noseq)
-{
- # test indexer
- if(!$noabseq)
- {
- if(!$noseqindex)
- {
- my $inx = Bio::ASN1::Sequence::Indexer->new(-filename => 't/testseq.idx',
- -write_flag => 'WRITE');
- isa_ok($inx, 'Bio::ASN1::Sequence::Indexer');
- $inx->make_index('t/seq.asn');
+
+SKIP: {
+ if ( !$noseq ) {
+ skip( "BioPerl not installed, skipping", 3 ) if $noabseq;
+
+ # test indexer
+ if ( !$noabseq ) {
+ if ( !$noseqindex ) {
+ my $inx = Bio::ASN1::Sequence::Indexer->new(
+ -filename => File::Spec->catfile('t','testseq.idx'),
+ -write_flag => 'WRITE'
+ );
+ isa_ok( $inx, 'Bio::ASN1::Sequence::Indexer' );
+ $inx->make_index(File::Spec->catfile('t','seq.asn'));
+
# cmp_ok($inx->count_records, '==', 2, 'total number of sequence ids in index');
- my $value = $inx->fetch_hash('AF093062');
- isa_ok($value, 'ARRAY');
- cmp_ok($value->[0]{'seq-set'}[0]{seq}[0]{id}[0]{genbank}[0]{accession}, 'eq', 'AF093062', 'correct sequence record retrieved');
+ my $value = $inx->fetch_hash('AF093062');
+ isa_ok( $value, 'ARRAY' );
+ cmp_ok(
+ $value->[0]{'seq-set'}[0]{seq}[0]{id}[0]{genbank}[0]
+ {accession},
+ 'eq', 'AF093062', 'correct sequence record retrieved'
+ );
+ }
+ else {
+ diag(
+"\nThere's some problem with the installation of Bio::ASN1::Sequence::Indexer!\nTry install again using:\n\tperl Makefile.PL\n\tmake\nQuitting now"
+ );
+ }
+ }
}
- else
- {
- diag("\nThere's some problem with the installation of Bio::ASN1::Sequence::Indexer!\nTry install again using:\n\tperl Makefile.PL\n\tmake\nQuitting now");
+ else {
+ diag(
+"\nThere's some problem with the installation of Bio::ASN1::Sequence!\nTry install again using:\n\tperl Makefile.PL\n\tmake\nQuitting now"
+ );
}
- }
- else
- {
- diag("\nYou need to have Bio::Index::AbstractSeq (bioperl.org)\ninstalled for testing the indexer!\nQuitting now");
- }
}
-else
-{
- diag("\nThere's some problem with the installation of Bio::ASN1::Sequence!\nTry install again using:\n\tperl Makefile.PL\n\tmake\nQuitting now");
-}
-
@@ -1,5 +1,7 @@
#!/usr/bin/env perl -w
use strict;
+use warnings;
+use File::Spec;
use Test::More tests => 10;
my ($nogene, $noseq);
@@ -12,7 +14,7 @@ BEGIN {
diag("\n\nFirst testing gene parser:\n");
if(!$nogene)
{
- my $parser = Bio::ASN1::EntrezGene->new(file => 't/input.asn');
+ my $parser = Bio::ASN1::EntrezGene->new(file => File::Spec->catfile('t','input.asn'));
isa_ok($parser, 'Bio::ASN1::EntrezGene');
my $value = $parser->next_seq;
isa_ok($value, 'ARRAY');
@@ -27,7 +29,7 @@ else
diag("\n\nNow testing sequence parser:\n");
if(!$noseq)
{
- my $parser = Bio::ASN1::Sequence->new(file => 't/seq.asn');
+ my $parser = Bio::ASN1::Sequence->new(file => File::Spec->catfile('t','seq.asn'));
isa_ok($parser, 'Bio::ASN1::Sequence');
my $value = $parser->next_seq;
isa_ok($value, 'ARRAY');