Addresses part of https://github.com/coder/registry/issues/194 ## Description This PR strengthens the README validation logic that we already had in place. It does not add validation for templates (that will be addressed in a second PR). ### Changes made - Added logic to reject unknown frontmatter fields for modules and contributor profile README files - Added logic to handle frontmatter fields that were previously missed in validation steps (GitHub username for contributors and operating systems for modules) - Updated a few comments (added some new comments, formatted existing comments to meet 100-column width) ### Type of Change - [ ] New module - [x] Bug fix - [ ] Feature/enhancement - [ ] Documentation - [x] Other ## Testing & Validation - [x] Tests pass (`bun test`) - [x] Code formatted (`bun run fmt`) - [x] Changes tested locally
196 lines
6.6 KiB
Go
196 lines
6.6 KiB
Go
package main
|
||
|
||
import (
|
||
"bufio"
|
||
"fmt"
|
||
"regexp"
|
||
"slices"
|
||
"strings"
|
||
|
||
"golang.org/x/xerrors"
|
||
)
|
||
|
||
// validationPhase represents a specific phase during README validation. It is expected that each phase is
// discrete, and errors during one will prevent a future phase from starting.
type validationPhase string

const (
	// rootRegistryPath is the relative path of the top-level registry directory.
	rootRegistryPath = "./registry"

	// --- validationPhases ---

	// validationPhaseStructure indicates when the entire Registry directory is being verified for having all
	// files be placed in the file system as expected.
	validationPhaseStructure validationPhase = "File structure validation"

	// validationPhaseFile indicates when README files are being read from the file system.
	validationPhaseFile validationPhase = "Filesystem reading"

	// validationPhaseReadme indicates when a README's frontmatter is being parsed as YAML. This phase does not
	// include YAML validation.
	validationPhaseReadme validationPhase = "README parsing"

	// validationPhaseCrossReference indicates when a README's frontmatter is having all its relative URLs be
	// validated for whether they point to valid resources.
	validationPhaseCrossReference validationPhase = "Cross-referencing relative asset URLs"

	// --- end of validationPhases ---
)
|
||
|
||
var (
	// readmeHeaderRe captures a run of "#" characters at the very start of a line (group 1) together with any
	// whitespace that immediately follows it (group 2), e.g. "# " or "### ". The pattern intentionally accepts
	// runs of any length, including ones that are not valid markdown headers (valid levels are 1-6), so that
	// validateReadmeBody can inspect the captured level itself and report a precise error.
	readmeHeaderRe = regexp.MustCompile(`^(#+)(\s*)`)

	// supportedAvatarFileFormats lists the file extensions (each with its leading dot) that count as supported
	// avatar formats. The code that consults this list lives outside this section of the file.
	supportedAvatarFileFormats = []string{".png", ".jpeg", ".jpg", ".gif", ".svg"}
)
|
||
|
||
// readme holds a single README file from the repo (typically one located under the top-level "/registry"
// directory). filePath is the file's path, and rawText is the file's raw text.
type readme struct {
	filePath, rawText string
}
|
||
|
||
// separateFrontmatter attempts to separate a README file's frontmatter content from the main README body, returning
|
||
// both values in that order. It does not validate whether the structure of the frontmatter is valid (i.e., that it's
|
||
// structured as YAML).
|
||
func separateFrontmatter(readmeText string) (readmeFrontmatter string, readmeBody string, err error) {
|
||
if readmeText == "" {
|
||
return "", "", xerrors.New("README is empty")
|
||
}
|
||
|
||
const fence = "---"
|
||
|
||
var fm strings.Builder
|
||
var body strings.Builder
|
||
fenceCount := 0
|
||
|
||
lineScanner := bufio.NewScanner(strings.NewReader(strings.TrimSpace(readmeText)))
|
||
for lineScanner.Scan() {
|
||
nextLine := lineScanner.Text()
|
||
if fenceCount < 2 && nextLine == fence {
|
||
fenceCount++
|
||
continue
|
||
}
|
||
// Break early if the very first line wasn't a fence, because then we know for certain that the README has problems.
|
||
if fenceCount == 0 {
|
||
break
|
||
}
|
||
|
||
// It should be safe to trim each line of the frontmatter on a per-line basis, because there shouldn't be any
|
||
// extra meaning attached to the indentation. The same does NOT apply to the README; best we can do is gather
|
||
// all the lines and then trim around it.
|
||
if inReadmeBody := fenceCount >= 2; inReadmeBody {
|
||
fmt.Fprintf(&body, "%s\n", nextLine)
|
||
} else {
|
||
fmt.Fprintf(&fm, "%s\n", strings.TrimSpace(nextLine))
|
||
}
|
||
}
|
||
if fenceCount < 2 {
|
||
return "", "", xerrors.New("README does not have two sets of frontmatter fences")
|
||
}
|
||
if fm.Len() == 0 {
|
||
return "", "", xerrors.New("readme has frontmatter fences but no frontmatter content")
|
||
}
|
||
|
||
return fm.String(), strings.TrimSpace(body.String()), nil
|
||
}
|
||
|
||
// TODO: This seems to work okay for now, but the really proper way of doing this is by parsing this as an AST, and then
|
||
// checking the resulting nodes.
|
||
func validateReadmeBody(body string) []error {
|
||
trimmed := strings.TrimSpace(body)
|
||
|
||
if trimmed == "" {
|
||
return []error{xerrors.New("README body is empty")}
|
||
}
|
||
|
||
// If the very first line of the README doesn't start with an ATX-style H1 header, there's a risk that the rest of the
|
||
// validation logic will break, since we don't have many guarantees about how the README is actually structured.
|
||
if !strings.HasPrefix(trimmed, "# ") {
|
||
return []error{xerrors.New("README body must start with ATX-style h1 header (i.e., \"# \")")}
|
||
}
|
||
|
||
var errs []error
|
||
latestHeaderLevel := 0
|
||
foundFirstH1 := false
|
||
isInCodeBlock := false
|
||
|
||
lineScanner := bufio.NewScanner(strings.NewReader(trimmed))
|
||
for lineScanner.Scan() {
|
||
nextLine := lineScanner.Text()
|
||
|
||
// Have to check this because a lot of programming languages support # comments (including Terraform), and
|
||
// without any context, there's no way to tell the difference between a markdown header and code comment.
|
||
if strings.HasPrefix(nextLine, "```") {
|
||
isInCodeBlock = !isInCodeBlock
|
||
continue
|
||
}
|
||
if isInCodeBlock {
|
||
continue
|
||
}
|
||
|
||
headerGroups := readmeHeaderRe.FindStringSubmatch(nextLine)
|
||
if headerGroups == nil {
|
||
continue
|
||
}
|
||
|
||
// In the Markdown spec it is mandatory to have a space following the header # symbol(s).
|
||
if headerGroups[2] == "" {
|
||
errs = append(errs, xerrors.New("header does not have space between header characters and main header text"))
|
||
}
|
||
|
||
nextHeaderLevel := len(headerGroups[1])
|
||
if nextHeaderLevel == 1 && !foundFirstH1 {
|
||
foundFirstH1 = true
|
||
latestHeaderLevel = 1
|
||
continue
|
||
}
|
||
|
||
// If we have obviously invalid headers, it's not really safe to keep proceeding with the rest of the content.
|
||
if nextHeaderLevel == 1 {
|
||
errs = append(errs, xerrors.New("READMEs cannot contain more than h1 header"))
|
||
break
|
||
}
|
||
if nextHeaderLevel > 6 {
|
||
errs = append(errs, xerrors.Errorf("README/HTML files cannot have headers exceed level 6 (found level %d)", nextHeaderLevel))
|
||
break
|
||
}
|
||
|
||
// This is something we need to enforce for accessibility, not just for the Registry website, but also when
|
||
// users are viewing the README files in the GitHub web view.
|
||
if nextHeaderLevel > latestHeaderLevel && nextHeaderLevel != (latestHeaderLevel+1) {
|
||
errs = append(errs, xerrors.New("headers are not allowed to increase more than 1 level at a time"))
|
||
continue
|
||
}
|
||
|
||
// As long as the above condition passes, there's no problems with going up a header level or going down 1+ header levels.
|
||
latestHeaderLevel = nextHeaderLevel
|
||
}
|
||
|
||
return errs
|
||
}
|
||
|
||
func validateFrontmatterYamlKeys(frontmatter string, allowedKeys []string) []error {
|
||
if len(allowedKeys) == 0 {
|
||
return []error{xerrors.New("Set of allowed keys is empty")}
|
||
}
|
||
|
||
var key string
|
||
var cutOk bool
|
||
var line string
|
||
|
||
var errs []error
|
||
lineScanner := bufio.NewScanner(strings.NewReader(frontmatter))
|
||
for lineScanner.Scan() {
|
||
line = lineScanner.Text()
|
||
key, _, cutOk = strings.Cut(line, ":")
|
||
if !cutOk || slices.Contains(allowedKeys, key) {
|
||
continue
|
||
}
|
||
errs = append(errs, xerrors.Errorf("detected unknown key %q", key))
|
||
}
|
||
return errs
|
||
}
|