# The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#------------------------------------------------------------------------------
# Pegex Grammar for YAML 1.2
#
# This is a PEG (top-down) grammar for the YAML 1.2 language. It is in the
# Pegex format, and can be used to construct a YAML parser in any language
# where Pegex has been ported to. (Currently Perl, Ruby and JavaScript).
#
# Compared to the official YAML spec, this grammar should be much easier to
# read and understand. It will also be fully documented, and will attempt to
# have a test suite that exercises every rule path.
#
# The overall intent of this is to have one working grammar that backs up a
# full YAML framework implementation in every programming language where YAML
# is used. If this is achieved, then a bug in YAML can be fixed in one place,
# for every language's implementation.
#
# This grammar will go further than just parsing correct YAML. It will also
# parse for common YAML errors, and try to report the most useful error
# messages.
#------------------------------------------------------------------------------

# Notes:
# - Indentation will be done with indent / ondent / undent
# - Need to check some rules against spec for accuracy.
# - Make the grammar strict as possible until justified.
# - Need to look for common errors in the grammar, and report them.
# - Need to have tests for known errors.

%grammar yaml
%version 0.0.1

#------------------------------------------------------------------------------
# High Level Constructs
#------------------------------------------------------------------------------

# yaml-stream is the top level rule and accounts for all of the text being
# parsed. It takes any leading ignorable lines, then zero or more documents,
# each of which may be followed by further ignorable lines.
# NOTE: Directives are not dealt with yet.
yaml-stream:
  ignore-line* ( yaml-document ignore-line* )*

# One YAML document: a single top level node, optionally preceded by an
# explicit head marker and optionally followed by an explicit foot marker.
# The optional ignore-line after the node is there to make sure parsing is on
# a line boundary before the foot marker is tried.
yaml-document:
  document-head? top-node ignore-line? document-foot?

# The node at the top of a document. After an optional node prefix it is
# exactly one of: an alias, a flow mapping, a flow sequence, a block sequence,
# a block mapping or a block scalar (tried in that order). An optional end of
# line may follow.
top-node:
  node-prefix?
  ( | node-alias | flow-mapping | flow-sequence
    | block-sequence | block-mapping | block-scalar )
  ( EOL? )

#------------------------------------------------------------------------------
# Block Constructs
#------------------------------------------------------------------------------

# All the block nodes, tried in this order: sequence, mapping, scalar.
block-node:
  | block-sequence | block-mapping | block-scalar

# A block sequence is one or more consecutive entries, each of which starts
# with a dash+space:
block-sequence: block-sequence-entry+

# A single sequence entry: a dash, one or more spaces, a block scalar and the
# end of the line.
# TODO This needs to support any block-node:
block-sequence-entry:
  / DASH SPACE+ block-scalar EOL /

# A block mapping is one or more key / value pairs (separated by colon+space)
# bracketed by block-indent and block-undent, both of which are implemented in
# code in the Grammar class:
block-mapping:
  block-indent block-mapping-pair+ block-undent

# One entry of a block mapping: block-ondent (implemented in code in the
# Grammar class), then the key, the colon separator and the value.
block-mapping-pair:
  block-ondent block-key block-mapping-separator block-value

# A block key is a block scalar that is immediately followed by a
# block-mapping-separator. The separator is only looked ahead for here; it is
# matched for real by block-mapping-pair. A block key scalar has more
# limitations than a block value scalar.
block-key: / block-scalar (= block-mapping-separator ) /

# The value of a block mapping pair: a flow mapping, a flow sequence, or any
# block node (tried in that order).
block-value:
  | flow-mapping | flow-sequence | block-node

# Block scalars come in 5 forms, tried in this order: literal, folded, double
# quoted, single quoted and plain. The whole alternation sits in one group.
block-scalar: /
  (
    literal-scalar
    | folded-scalar
    | double-quoted-scalar
    | single-quoted-scalar
    | block-plain-scalar
  )
/

#------------------------------------------------------------------------------
# Flow Constructs:
#------------------------------------------------------------------------------

# The 3 kinds of flow node, tried in this order: sequence, mapping, scalar.
flow-node:
  | flow-sequence | flow-mapping | flow-scalar

# Square brackets around zero or more entries, with list-separator between
# the entries. The %% operator is what allows a trailing separator.
flow-sequence:
  flow-sequence-start flow-sequence-entry* %% list-separator flow-sequence-end

# Curly braces around zero or more key / value pairs, with list-separator
# between the pairs. The %% operator is what allows a trailing separator.
flow-mapping:
  flow-mapping-start flow-mapping-pair* %% list-separator flow-mapping-end

# Flow scalars come in 3 basic forms, tried in this order: double quoted,
# single quoted and plain. The whole alternation sits in one group.
flow-scalar: /
  (
    double-quoted-scalar
    | single-quoted-scalar
    | flow-plain-scalar
  )
/

# A flow sequence entry is any flow node (sequence, mapping or scalar), so
# flow collections can be nested inside a sequence, e.g. [a, [b, c], {d: e}].
# This rule is an alias for flow-node, and can maybe go away later, but it is
# left this way for now for clarity.
flow-sequence-entry: flow-node

# A key and a value with the flow-mapping-separator between them. Both key and
# value can be any node, as long as it is in flow syntax.
flow-mapping-pair:
  flow-node flow-mapping-separator flow-node

# Starting and ending rules for flow collections: square brackets delimit a
# sequence and curly braces delimit a mapping. Each bracket has the `-`
# whitespace rule on both sides.
flow-sequence-start: /- '[' -/
flow-sequence-end: /- ']' -/
flow-mapping-start: /- '{' -/
flow-mapping-end: /- '}' -/

#------------------------------------------------------------------------------
# Scalar Constructs
#------------------------------------------------------------------------------

# Literal scalar.
# XXX Dummied out for now: this placeholder only matches a '|', an end of
# line and then the literal text 'XXX'.
literal-scalar: / '|' EOL 'XXX' /

# Folded scalar.
# XXX Dummied out for now: this placeholder only matches a '>', an end of
# line and then the literal text 'XXX'.
folded-scalar: / '>' EOL 'XXX' /

# Double quoted scalar: a double quote, any run of characters that are not
# double quotes, and a closing double quote.
# XXX Needs work. No escape sequences are handled here.
double-quoted-scalar: / DOUBLE [^ DOUBLE]* DOUBLE /

# Single quoted scalar: a single quote, any run of characters that are not
# single quotes, and a closing single quote.
# XXX Needs work. No quote escaping is handled here.
single-quoted-scalar: / SINGLE [^ SINGLE]* SINGLE /

# A plain (unquoted) scalar in block context. Its first character may not be
# one of char-non-start, and it runs (non-greedily) up to, but not including,
# a colon followed by whitespace, an end of line, or the end of the input.
block-plain-scalar:
  / (! char-non-start) ANY+? (= COLON WS | EOL | EOS) /

# A plain (unquoted) scalar in flow context, which is more restrictive than
# block context. Its first character may not be one of char-non-start, and it
# runs (non-greedily) up to, but not including, any chars-syntax character or
# comma, a colon+space, an end of line, or the end of the input.
flow-plain-scalar:
  / (! char-non-start) ANY+?
    (= [ chars-syntax COMMA ] | COLON SPACE | COMMA SPACE | EOL | EOS) /

#------------------------------------------------------------------------------
# Other Constructs:
#------------------------------------------------------------------------------

# block-indent: # This rule is written in code in the Grammar class.
# block-ondent: # This rule is written in code in the Grammar class.
# block-undent: # This rule is written in code in the Grammar class.

# A YAML header is 3 dashes followed either by one or more spaces, or directly
# by an end of line. The end of line is only looked ahead for, not consumed.
# The lookahead uses the same `(= ...)` shorthand as the other rules in this
# grammar.
document-head: / '---' (: SPACE+ | (= EOL)) /

# A YAML footer is 3 dots followed by a newline, which this rule consumes:
document-foot: / '...' EOL /

# A node prefix is an anchor and / or tag in any order. When both are present
# they are separated by one or more spaces.
# XXX This construct is hard in PEG. Look for easier way.
node-prefix:
  | node-anchor (SPACE+ node-tag)?
  | node-tag (SPACE+ node-anchor)?

# An explicit node tag: one or two bangs followed by one or more word
# characters, which are captured. Like '!foo' or '!!str'.
# TODO This is very incomplete!
node-tag: / BANG BANG? ( WORD+ ) /

# A Node Anchor is a name for a node. Like '&this'. It is an ampersand
# followed by one or more word characters, which are captured.
# TODO See spec for real definition.
node-anchor: / '&' ( WORD+ ) /

# A Node Alias is a reference to an anchored node. Like '*this'. It is a star
# followed by one or more word characters, which are captured.
node-alias: / '*' ( WORD+ ) /

# A mapping key and value are separated by a colon that is followed either by
# one or more spaces, or by optional spaces up to an end of line. The end of
# line itself is only looked ahead for, not consumed. The flow and block
# versions are currently identical.
flow-mapping-separator: / ':' (: SPACE+ | SPACE* (= EOL)) /
block-mapping-separator: / ':' (: SPACE+ | SPACE* (= EOL)) /

# List items are separated by ',' (comma), optionally followed by spaces.
# In flow context a comma separates entries even when no space follows it (a
# flow-plain-scalar already stops before any comma), so '[a,b]' and a trailing
# comma as in '[a,]' both have to be accepted. Hence SPACE* rather than SPACE+.
list-separator: / ',' SPACE* /

# The single chars that are YAML syntax (and thus must be avoided in various
# contexts). Other rules in this grammar place this list inside a character
# class.
chars-syntax:
  / AMP STAR HASH LCURLY RCURLY LSQUARE RSQUARE PERCENT /

# The chars that YAML reserves: the grave accent and the at sign.
chars-reserved: / GRAVE AT /

# Character class of the chars that a plain scalar may not start with: the
# syntax chars plus the reserved chars.
char-non-start: / [ chars-syntax chars-reserved ] /


#------------------------------------------------------------------------------
# Whitespace Rules:
#------------------------------------------------------------------------------

# TODO Need to determine the - and + whitespace rule.

# Ignore comments and whitespace until end of line. The end of line is only
# looked ahead for; this rule does not consume it.
ignore-line: / ignore-text (= EOL) /

# Ignorable text is optional spaces and tabs, optionally followed by a line
# comment. The blanks are matched first so that an indented comment such as
# '   # note' is ignorable as a whole, not just a comment starting in the
# first column. The comment is wrapped in its own group so that the `?`
# applies to the whole comment pattern.
ignore-text: / (: blank-text (: comment-text )? ) /

# A '#' starts a comment until end of line: a hash followed by zero or more
# of any character.
comment-text: / HASH ANY* /

# Spaces and tabs. Zero or more, so this rule can also match nothing at all.
blank-text: / BLANK* /

# Vim Helpers, until we get `pegex.vim` mode.
# vim: set lisp sw=2: