1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642
|
package yaml
import (
"bytes"
"encoding/base64"
"errors"
"fmt"
"io"
"regexp"
"strconv"
"strings"
"sync"
"gopkg.in/yaml.v3"
"cuelang.org/go/cue/ast"
"cuelang.org/go/cue/literal"
"cuelang.org/go/cue/token"
"cuelang.org/go/internal"
)
// TODO(mvdan): we should sanity check that the decoder always produces valid CUE,
// as it is possible to construct a cue/ast syntax tree with invalid literals
// or with expressions that will always error, such as `float & 123`.
//
// One option would be to do this as part of the tests; a more general approach
// may be fuzzing, which would find more bugs and work for any decoder,
// although it may be slow as we need to involve the evaluator.
// Decoder is a temporary interface compatible with both the old and new yaml decoders.
type Decoder interface {
// Decode consumes a YAML value and returns it as a CUE syntax tree node.
//
// The decoder implemented in this file returns a nil node with an io.EOF error
// once no more YAML values are available.
Decode() (ast.Expr, error)
}
// decoder wraps a [yaml.Decoder] to extract CUE syntax tree nodes.
//
// It carries state across the extraction of nodes, such as pending comments
// and the last seen position, so YAML nodes should be visited in source order.
type decoder struct {
// yamlDecoder is the underlying yaml.v3 decoder, reading from the input bytes.
yamlDecoder yaml.Decoder
// yamlNonEmpty is true once yamlDecoder tells us the input YAML wasn't empty.
// Useful so that we can extract "null" when the input is empty.
yamlNonEmpty bool
// decodeErr is returned by any further calls to Decode when not nil.
decodeErr error
// tokFile holds the file name and line information for the input,
// and is used to create cue/token positions and to prefix error messages.
tokFile *token.File
// tokLines holds the line offsets reported by tokFile plus one final entry
// for the end of the input. It is indexed by line number (minus one)
// to turn a YAML line and column into a byte offset.
tokLines []int
// pendingHeadComments collects the head (preceding) comments
// from the YAML nodes we are extracting.
// We can't add comments to a CUE syntax tree node until we've created it,
// but we need to extract these comments first since they have earlier positions.
pendingHeadComments []*ast.Comment
// extractingAliases ensures we don't loop forever when expanding YAML anchors.
// It is keyed by the alias nodes which are currently being expanded.
extractingAliases map[*yaml.Node]bool
// lastPos is the last YAML node position that we decoded,
// used for working out relative positions such as token.NewSection.
// This position can only increase, moving forward in the file.
lastPos token.Position
// forceNewline ensures that the next position will be on a new line.
// It is reset as soon as the next position is computed.
forceNewline bool
}
// TODO(mvdan): this can be io.Reader really, except that token.Pos is offset-based,
// so the only way to really have true Offset+Line+Col numbers is to know
// the size of the entire YAML node upfront.
// With json we can use RawMessage to know the size of the input
// before we extract into ast.Expr, but unfortunately, yaml.Node has no size.
// NewDecoder returns a decoder which reads YAML values from b
// and extracts them as CUE syntax tree nodes.
//
// The filename is recorded for the position information in the resulting
// CUE syntax tree nodes, and is used as the prefix of any decoding errors.
func NewDecoder(filename string, b []byte) *decoder {
	size := len(b)

	// yaml.v3 can insert a null node just past the end of the input
	// in some edge cases. Pretend that the file has one extra byte,
	// and one extra line starting at the end of the input,
	// so that handling such a position does not panic.
	file := token.NewFile(filename, 0, size+1)
	file.SetLinesForContent(b)

	d := &decoder{tokFile: file}
	d.tokLines = append(file.Lines(), size)
	d.yamlDecoder = *yaml.NewDecoder(bytes.NewReader(b))
	return d
}
// Decode consumes the next YAML value and returns it as a CUE syntax tree node.
//
// Once no more YAML values are available, a nil node with an io.EOF error
// is returned. An entirely empty input yields a single null literal first.
// After io.EOF or a rewritten syntax error, later calls repeat the same error.
func (d *decoder) Decode() (ast.Expr, error) {
	if d.decodeErr != nil {
		return nil, d.decodeErr
	}

	var yn yaml.Node
	err := d.yamlDecoder.Decode(&yn)
	if err == nil {
		d.yamlNonEmpty = true
		return d.extract(&yn)
	}

	if err == io.EOF {
		// Any further Decode calls must return EOF to avoid an endless loop.
		d.decodeErr = io.EOF
		if d.yamlNonEmpty {
			// We already decoded some CUE syntax nodes, so simply stop here.
			return nil, io.EOF
		}
		// The input is empty, so produce a single null literal.
		// Note that when the input contains "---", we get an empty document
		// with a null scalar value inside instead.
		return &ast.BasicLit{Kind: token.NULL, Value: "null"}, nil
	}

	// Unfortunately, yaml.v3's syntax errors are opaque strings,
	// and they only include line numbers in some but not all cases.
	// TODO(mvdan): improve upstream's errors so they are structured
	// and always contain some position information.
	//
	// Rewrite "yaml: line 3: some issue" into "foo.yaml:3: some issue",
	// and "yaml: some issue" into "foo.yaml: some issue".
	msg := err.Error()
	rest, ok := strings.CutPrefix(msg, "yaml: line ")
	if !ok {
		rest, ok = strings.CutPrefix(msg, "yaml:")
	}
	if !ok {
		// Not an error shape we know how to rewrite; return it untouched.
		return nil, err
	}

	// Any further Decode calls repeat this error.
	d.decodeErr = errors.New(d.tokFile.Name() + ":" + rest)
	return nil, d.decodeErr
}
// Unmarshal decodes data, which must hold a single YAML document,
// into a CUE expression. The filename is used for positions and errors.
//
// An error is returned if data holds more than one YAML document.
func Unmarshal(filename string, data []byte) (ast.Expr, error) {
	dec := NewDecoder(filename, data)

	expr, err := dec.Decode()
	switch {
	case err == io.EOF:
		return nil, nil // empty input
	case err != nil:
		return nil, err
	}

	// TODO(mvdan): decoding the entire next value is unnecessary;
	// consider either a "More" or "Done" method to tell if we are at EOF,
	// or splitting the Decode method into two variants.
	// This should use proper error values with positions as well.
	extra, err := dec.Decode()
	switch err {
	case nil:
		return nil, fmt.Errorf("%s: expected a single YAML document", extra.Pos())
	case io.EOF:
		return expr, nil
	default:
		return nil, fmt.Errorf("expected a single YAML document: %v", err)
	}
}
// extract converts a YAML node of any kind into a CUE expression.
//
// The node's head comments are queued before the conversion, and all of its
// comments are attached to the resulting expression once it exists.
func (d *decoder) extract(yn *yaml.Node) (ast.Expr, error) {
	d.addHeadCommentsToPending(yn)

	// Pick the conversion method matching the kind of node.
	var convert func(*yaml.Node) (ast.Expr, error)
	switch yn.Kind {
	case yaml.DocumentNode:
		convert = d.document
	case yaml.SequenceNode:
		convert = d.sequence
	case yaml.MappingNode:
		convert = d.mapping
	case yaml.ScalarNode:
		convert = d.scalar
	case yaml.AliasNode:
		convert = d.alias
	default:
		return nil, d.posErrorf(yn, "unknown yaml node kind: %d", yn.Kind)
	}

	expr, err := convert(yn)
	if err != nil {
		return nil, err
	}
	d.addCommentsToNode(expr, yn, 1)
	return expr, nil
}
// comments turns a newline-delimited list of YAML "#" comments
// into the equivalent list of cue/ast "//" comments.
// It returns nil if src holds no comments.
func (d *decoder) comments(src string) []*ast.Comment {
	var list []*ast.Comment
	if src == "" {
		return list
	}
	for _, text := range strings.Split(src, "\n") {
		// yaml.v3 comments have a trailing newline at times,
		// which shows up here as an empty line to be skipped.
		if text != "" {
			// Swap the leading "#" for "//".
			// Note that yaml.v3 does not give us comment positions.
			list = append(list, &ast.Comment{Text: "//" + text[1:]})
		}
	}
	return list
}
// addHeadCommentsToPending parses the head comments of a YAML node
// and queues them in pendingHeadComments, so that addCommentsToNode
// can attach them once the matching cue/ast node has been constructed.
func (d *decoder) addHeadCommentsToPending(yn *yaml.Node) {
	head := d.comments(yn.HeadComment)

	// TODO(mvdan): once yaml.v3 records comment positions,
	// we can better ensure that sections separated by empty lines are kept that way.
	// For now, all we can do is approximate by counting lines,
	// and assuming that head comments are not separated from their node.
	// This will be wrong in some cases, moving empty lines, but is better than nothing.
	if len(head) > 0 && len(d.pendingHeadComments) == 0 && d.lastPos.IsValid() {
		// The line where the comments would start if they directly precede the node.
		firstLine := yn.Line - len(head)
		if firstLine-d.lastPos.Line >= 2 {
			head[0].Slash = head[0].Slash.WithRel(token.NewSection)
		}
	}

	d.pendingHeadComments = append(d.pendingHeadComments, head...)
}
// addCommentsToNode attaches to the cue/ast node n any pending head comments,
// as well as the line and foot comments of the YAML node yn.
// The line comment group is given the position linePos.
// The list of pending head comments is emptied afterwards.
func (d *decoder) addCommentsToNode(n ast.Node, yn *yaml.Node, linePos int8) {
	// cue/ast and cue/format are not able to attach a comment to a node
	// when the comment immediately follows the node.
	// For some nodes like fields, the best we can do is move the comments up.
	// For the root-level struct, we do want to leave comments
	// at the end of the document to be left at the very end.
	//
	// TODO(mvdan): can we do better? for example, support attaching trailing comments to a cue/ast.Node?
	foot := d.comments(yn.FootComment)
	if _, isStruct := n.(*ast.StructLit); !isStruct {
		d.pendingHeadComments = append(d.pendingHeadComments, foot...)
		foot = nil
	}

	head := d.pendingHeadComments
	d.pendingHeadComments = nil
	if len(head) > 0 {
		ast.AddComment(n, &ast.CommentGroup{Doc: true, List: head})
	}

	if line := d.comments(yn.LineComment); len(line) > 0 {
		ast.AddComment(n, &ast.CommentGroup{
			Line:     true,
			Position: linePos,
			List:     line,
		})
	}

	if len(foot) > 0 {
		ast.AddComment(n, &ast.CommentGroup{
			// After 100 tokens, so that the comment goes after the entire node.
			// TODO(mvdan): this is hacky, can the cue/ast API support trailing comments better?
			Position: 100,
			List:     foot,
		})
	}
}
// posErrorf returns an error formatted like fmt.Errorf,
// prefixed with the file name and the line of the YAML node yn,
// such as "foo.yaml:3: some issue".
func (d *decoder) posErrorf(yn *yaml.Node, format string, args ...any) error {
	// TODO(mvdan): use columns as well; for now they are left out to avoid test churn
	// return fmt.Errorf(d.pos(n).String()+" "+format, args...)

	// The prefix becomes part of the format string, so any '%' in the file name
	// must be escaped; otherwise it would be parsed as a formatting verb,
	// garbling the message and consuming arguments meant for format.
	name := strings.ReplaceAll(d.tokFile.Name(), "%", "%%")
	return fmt.Errorf(name+":"+strconv.Itoa(yn.Line)+": "+format, args...)
}
// pos converts the position of a YAML node into a cue/ast position,
// including its position relative to the previously converted node.
//
// Note that this method uses and updates the last position in lastPos,
// so it should be called on YAML nodes in increasing position order.
// It also consumes forceNewline when set.
func (d *decoder) pos(yn *yaml.Node) token.Pos {
	// Calculate the position's offset via the line and column numbers.
	offset := d.tokLines[yn.Line-1] + (yn.Column - 1)
	pos := d.tokFile.Pos(offset, token.NoRelPos)

	switch {
	case d.forceNewline:
		d.forceNewline = false
		pos = pos.WithRel(token.Newline)

	case d.lastPos.IsValid():
		lineDelta := yn.Line - d.lastPos.Line
		colDelta := yn.Column - d.lastPos.Column

		// If for any reason the node's position is before the last position,
		// give up and return an empty position. Akin to: yn.Pos().Before(d.lastPos)
		// Note that lastPos is left untouched in this case.
		//
		// TODO(mvdan): Brought over from the old decoder; when does this happen?
		// Can we get rid of those edge cases and this bit of logic?
		if lineDelta < 0 || (lineDelta == 0 && colDelta < 0) {
			return token.NoPos
		}

		rel := token.NoSpace
		switch {
		case lineDelta >= 2:
			rel = token.NewSection
		case lineDelta == 1:
			rel = token.Newline
		case colDelta > 0:
			rel = token.Blank
		}
		pos = pos.WithRel(rel)
	}

	d.lastPos = token.Position{Line: yn.Line, Column: yn.Column}
	return pos
}
// document extracts the single content node held by a YAML document node.
// It is an error for the document to have any other number of content nodes.
func (d *decoder) document(yn *yaml.Node) (ast.Expr, error) {
	if len(yn.Content) == 1 {
		return d.extract(yn.Content[0])
	}
	return nil, d.posErrorf(yn, "yaml document nodes are meant to have one content node but have %d", len(yn.Content))
}
// sequence converts a YAML sequence node into a CUE list literal,
// extracting each of its elements in order.
func (d *decoder) sequence(yn *yaml.Node) (ast.Expr, error) {
	list := &ast.ListLit{}
	list.Lbrack = d.pos(yn).WithRel(token.Blank)

	// A sequence is multiline when its last element starts on a later line.
	count := len(yn.Content)
	multiline := count > 0 && yn.Line < yn.Content[count-1].Line

	for _, child := range yn.Content {
		d.forceNewline = multiline
		elem, err := d.extract(child)
		if err != nil {
			return nil, err
		}
		list.Elts = append(list.Elts, elem)
	}

	// If a list is empty, or ends with a struct, the closing `]` is on the same line.
	// A list of structs begins with `[{`, so let it end with `}]`.
	// Note that a multiline list always has at least one element.
	if multiline {
		if _, endsWithStruct := list.Elts[count-1].(*ast.StructLit); !endsWithStruct {
			list.Rbrack = list.Rbrack.WithRel(token.Newline)
		}
	}
	return list, nil
}
// mapping converts a YAML mapping node into a CUE struct literal,
// with one field per key-value pair as inserted by insertMap.
func (d *decoder) mapping(yn *yaml.Node) (ast.Expr, error) {
	// A mapping is multiline when its last content node starts on a later line.
	count := len(yn.Content)
	multiline := count > 0 && yn.Line < yn.Content[count-1].Line

	strct := new(ast.StructLit)
	if err := d.insertMap(yn, strct, multiline, false); err != nil {
		return nil, err
	}

	// TODO(mvdan): moving these positions above insertMap breaks a few tests, why?
	open := d.pos(yn).WithRel(token.Blank)
	strct.Lbrace = open
	strct.Rbrace = open
	if multiline {
		strct.Rbrace = open.WithRel(token.Newline)
	}
	return strct, nil
}
// insertMap extracts the key-value pairs in the content of yn
// and adds them as fields to the struct literal m.
//
// Merge keys ("<<") are expanded into m via merge. When mergeValues is true,
// or once a merge key has been seen, a key which already exists in m
// replaces the value of the existing field rather than adding a new field.
// When multiline is true, each pair is forced onto a new line.
func (d *decoder) insertMap(yn *yaml.Node, m *ast.StructLit, multiline, mergeValues bool) error {
	pairs := yn.Content
	for i, n := 0, len(pairs); i < n; i += 2 {
		if multiline {
			d.forceNewline = true
		}
		yk, yv := pairs[i], pairs[i+1]
		d.addHeadCommentsToPending(yk)

		if isMerge(yk) {
			mergeValues = true
			if err := d.merge(yv, m, multiline); err != nil {
				return err
			}
			continue
		}
		if yk.Kind != yaml.ScalarNode {
			return d.posErrorf(yn, "invalid map key: %v", yk.ShortTag())
		}

		label, err := d.label(yk)
		if err != nil {
			return err
		}
		field := &ast.Field{}
		d.addCommentsToNode(field, yk, 2)
		field.Label = label

		// By default the value goes into the new field, which is then appended.
		// When merging, an existing field with the same name receives the value instead.
		target, isNew := field, true
		if mergeValues {
			key := labelStr(label)
			for _, decl := range m.Elts {
				f := decl.(*ast.Field)
				if name, _, err := ast.LabelName(f.Label); err == nil && name == key {
					target, isNew = f, false
					break
				}
			}
		}

		target.Value, err = d.extract(yv)
		if err != nil {
			return err
		}
		if isNew {
			m.Elts = append(m.Elts, target)
		}
	}
	return nil
}
// merge expands the value yn of a YAML merge key ("<<") into the struct literal m.
//
// The value must be a mapping, an alias to a mapping, or a sequence of such
// values; anything else results in an error carrying the node's line.
// The multiline flag is passed along to insertMap.
func (d *decoder) merge(yn *yaml.Node, m *ast.StructLit, multiline bool) error {
	switch yn.Kind {
	case yaml.MappingNode:
		return d.insertMap(yn, m, multiline, true)
	case yaml.AliasNode:
		// insertMap walks the node's content as key-value pairs,
		// so an alias to anything but a mapping must be rejected here.
		// Otherwise an aliased sequence would be misread as pairs,
		// indexing out of range when it has an odd number of elements,
		// and an aliased scalar would be silently ignored.
		if yn.Alias == nil || yn.Alias.Kind != yaml.MappingNode {
			return d.posErrorf(yn, "map merge requires map or sequence of maps as the value")
		}
		return d.insertMap(yn.Alias, m, multiline, true)
	case yaml.SequenceNode:
		// Step backwards as earlier nodes take precedence.
		for i := len(yn.Content) - 1; i >= 0; i-- {
			if err := d.merge(yn.Content[i], m, multiline); err != nil {
				return err
			}
		}
		return nil
	default:
		return d.posErrorf(yn, "map merge requires map or sequence of maps as the value")
	}
}
// label converts a YAML scalar node used as a mapping key into a CUE label.
//
// String keys which are valid identifiers, and are neither definitions nor
// hidden, become identifiers; other string keys become string literals.
// Any other basic literal is quoted to become a string label.
// Scalars which do not decode to a basic literal, such as negative numbers
// which makeNum wraps in a unary expression, result in an error.
func (d *decoder) label(yn *yaml.Node) (ast.Label, error) {
	pos := d.pos(yn)
	expr, err := d.scalar(yn)
	if err != nil {
		return nil, err
	}
	lit, ok := expr.(*ast.BasicLit)
	if !ok {
		// Pass the value as an argument rather than as part of the format string,
		// so that any '%' in it is not parsed as a formatting verb.
		return nil, d.posErrorf(yn, "invalid label %s", yn.Value)
	}
	if lit.Kind != token.STRING {
		return &ast.BasicLit{
			ValuePos: pos,
			Kind:     token.STRING,
			Value:    literal.Label.Quote(lit.Value),
		}, nil
	}
	if ast.IsValidIdent(yn.Value) && !internal.IsDefOrHidden(yn.Value) {
		return &ast.Ident{
			NamePos: pos,
			Name:    yn.Value,
		}, nil
	}
	ast.SetPos(lit, pos)
	return lit, nil
}
// Short-form YAML tags, in the form that this file compares against
// the result of yaml.Node.ShortTag.
const (
// TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow?
nullTag = "!!null"
boolTag = "!!bool"
strTag = "!!str"
intTag = "!!int"
floatTag = "!!float"
timestampTag = "!!timestamp"
seqTag = "!!seq"
mapTag = "!!map"
binaryTag = "!!binary"
mergeTag = "!!merge"
)
// rxAnyOctalYaml11 returns a regular expression based on the implicit tag
// resolution pattern for base-8 integers from the YAML 1.1 spec,
// widened to also accept the digits 8 and 9, which are not valid in octal integers.
// The expression is compiled on first use and reused afterwards.
var rxAnyOctalYaml11 = sync.OnceValue(func() *regexp.Regexp {
	const pattern = `^[-+]?0[0-9_]+$`
	return regexp.MustCompile(pattern)
})
// scalar converts a YAML scalar node into a CUE expression,
// choosing the kind of literal from the node's short tag.
//
// An error carrying the node's line is returned for unsupported tags,
// invalid base64 in binary values, and numbers which fail to parse.
func (d *decoder) scalar(yn *yaml.Node) (ast.Expr, error) {
	tag := yn.ShortTag()

	// If the YAML scalar has no explicit tag, yaml.v3 infers a float tag,
	// and the value looks like a YAML 1.1 octal literal,
	// that means the input value was like `01289` and not a valid octal integer.
	// The safest thing to do, and what most YAML decoders do, is to interpret as a string.
	explicitTag := yn.Style&yaml.TaggedStyle != 0
	if !explicitTag && tag == floatTag && rxAnyOctalYaml11().MatchString(yn.Value) {
		tag = strTag
	}

	// stringLit builds a literal of kind string, placed at the node's position,
	// from an already quoted value.
	stringLit := func(quoted string) *ast.BasicLit {
		return &ast.BasicLit{
			ValuePos: d.pos(yn),
			Kind:     token.STRING,
			Value:    quoted,
		}
	}

	switch tag {
	// TODO: use parse literal or parse expression instead.
	case timestampTag:
		return stringLit(literal.String.Quote(yn.Value)), nil

	case strTag:
		return stringLit(literal.String.WithOptionalTabIndent(1).Quote(yn.Value)), nil

	case binaryTag:
		raw, err := base64.StdEncoding.DecodeString(yn.Value)
		if err != nil {
			return nil, d.posErrorf(yn, "!!binary value contains invalid base64 data")
		}
		return stringLit(literal.Bytes.Quote(string(raw))), nil

	case boolTag:
		// TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow?
		isTrue := yn.Value == "true" || yn.Value == "True" || yn.Value == "TRUE"
		lit := ast.NewBool(isTrue)
		lit.ValuePos = d.pos(yn)
		return lit, nil

	case intTag:
		// Convert YAML octal to CUE octal. If YAML accepted an invalid
		// integer, just convert it as well to ensure CUE will fail.
		num := yn.Value
		if len(num) > 1 && num[0] == '0' && num[1] <= '9' {
			num = "0o" + num[1:]
		}

		// We make the assumption that any valid YAML integer literal will be a valid
		// CUE integer literal as well, with the only exception of octal numbers above.
		// Note that `!!int 123.456` is not allowed.
		var info literal.NumInfo
		if err := literal.ParseNum(num, &info); err != nil {
			return nil, d.posErrorf(yn, "cannot decode %q as %s: %v", num, tag, err)
		}
		if !info.IsInt() {
			return nil, d.posErrorf(yn, "cannot decode %q as %s: not a literal number", num, tag)
		}
		return d.makeNum(yn, num, token.INT), nil

	case floatTag:
		num := yn.Value

		// Infinity and NaN are spelled differently in CUE and are not parsed below.
		// TODO(mvdan): The strings below are from yaml.v3; should we be relying on upstream somehow?
		switch num {
		case ".inf", ".Inf", ".INF", "+.inf", "+.Inf", "+.INF":
			return d.makeNum(yn, "+Inf", token.FLOAT), nil
		case "-.inf", "-.Inf", "-.INF":
			return d.makeNum(yn, "-Inf", token.FLOAT), nil
		case ".nan", ".NaN", ".NAN":
			return d.makeNum(yn, "NaN", token.FLOAT), nil
		}

		// We make the assumption that any valid YAML float literal will be a valid
		// CUE float literal as well, with the only exception of Inf/NaN above.
		// Note that `!!float 123` is allowed.
		var info literal.NumInfo
		if err := literal.ParseNum(num, &info); err != nil {
			return nil, d.posErrorf(yn, "cannot decode %q as %s: %v", num, tag, err)
		}

		// If the decoded YAML scalar was explicitly or implicitly a float,
		// and the scalar literal looks like an integer,
		// unify it with "number" to record the fact that it was represented as a float.
		// Don't unify with float, as `float & 123` is invalid, and there's no need
		// to forbid representing the number as an integer either.
		//
		// TODO: number(v) when we have conversions
		// TODO(mvdan): don't shove the unification inside a BasicLit.Value string
		//
		// TODO(mvdan): would it be better to do turn `!!float 123` into `123.0`
		// rather than `number & 123`? Note that `float & 123` is an error.
		if yn.Tag != "" && strings.IndexAny(num, ".eEiInN") < 0 {
			num = "number & " + num
		}
		return d.makeNum(yn, num, token.FLOAT), nil

	case nullTag:
		return &ast.BasicLit{
			ValuePos: d.pos(yn).WithRel(token.Blank),
			Kind:     token.NULL,
			Value:    "null",
		}, nil

	default:
		return nil, d.posErrorf(yn, "cannot unmarshal tag %q", tag)
	}
}
// makeNum builds a number literal of the given kind at the position of yn.
// A leading "-" in val is not kept in the literal; instead, the literal is
// wrapped in a unary negation expression.
func (d *decoder) makeNum(yn *yaml.Node, val string, kind token.Token) (expr ast.Expr) {
	unsigned, negative := strings.CutPrefix(val, "-")

	// The literal's position is computed first, before the operator's,
	// as each call to pos depends on and updates the last position.
	lit := &ast.BasicLit{
		ValuePos: d.pos(yn),
		Kind:     kind,
		Value:    unsigned,
	}
	if !negative {
		return lit
	}
	return &ast.UnaryExpr{
		OpPos: d.pos(yn),
		Op:    token.SUB,
		X:     lit,
	}
}
// alias extracts the node which a YAML alias node refers to.
// An alias which is reached again while it is still being extracted
// results in an error, to avoid recursing forever.
func (d *decoder) alias(yn *yaml.Node) (ast.Expr, error) {
	if d.extractingAliases == nil {
		d.extractingAliases = make(map[*yaml.Node]bool)
	} else if d.extractingAliases[yn] {
		// TODO this could actually be allowed in some circumstances.
		return nil, d.posErrorf(yn, "anchor %q value contains itself", yn.Value)
	}

	// Mark the alias as in progress only for the duration of the extraction.
	d.extractingAliases[yn] = true
	expr, err := d.extract(yn.Alias)
	delete(d.extractingAliases, yn)
	return expr, err
}
// labelStr returns the name of an identifier label or the unquoted value
// of a basic literal label. It returns the empty string for any other
// kind of label, or when a literal cannot be unquoted.
func labelStr(l ast.Label) string {
	if ident, ok := l.(*ast.Ident); ok {
		return ident.Name
	}
	if lit, ok := l.(*ast.BasicLit); ok {
		// A failure to unquote leaves us with the empty string.
		unquoted, _ := literal.Unquote(lit.Value)
		return unquoted
	}
	return ""
}
// isMerge reports whether yn is a YAML merge key: a scalar node with the
// value "<<" whose tag is empty, "!", or resolves to the merge tag.
func isMerge(yn *yaml.Node) bool {
	// TODO(mvdan): The boolean logic below is from yaml.v3; should we be relying on upstream somehow?
	if yn.Kind != yaml.ScalarNode || yn.Value != "<<" {
		return false
	}
	switch yn.Tag {
	case "", "!":
		return true
	}
	return yn.ShortTag() == mergeTag
}
|