Michael Smith fd074a5643
fix: improve logic for existing README validation (#325)
Addresses part of https://github.com/coder/registry/issues/194

## Description

This PR beefs up the validation for the validation logic that we already
had in place. This PR does not include adding validation for templates
(which will be addressed in a second PR).

### Changes made
- Added logic to reject unknown frontmatter fields for modules and
contributor profile README files
- Added logic to handle frontmatter fields that were previously missed
in validation steps (GitHub username for contributors and Operating
Systems for modules)
- Updated a few comments (added some new comments, formatted existing
comments to meet 100-column width)

### Type of Change

- [ ] New module
- [x] Bug fix
- [ ] Feature/enhancement
- [ ] Documentation
- [x] Other

## Testing & Validation

- [x] Tests pass (`bun test`)
- [x] Code formatted (`bun run fmt`)
- [x] Changes tested locally
2025-08-13 11:07:08 -05:00

196 lines
6.6 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
"bufio"
"fmt"
"regexp"
"slices"
"strings"
"golang.org/x/xerrors"
)
// validationPhase represents a specific phase during README validation. It is expected that each phase is discrete, and
// errors during one will prevent a future phase from starting.
type validationPhase string
const (
rootRegistryPath = "./registry"
// --- validationPhases ---
// validationPhaseStructure indicates when the entire Registry
// directory is being verified for having all files be placed in the file
// system as expected.
validationPhaseStructure validationPhase = "File structure validation"
// ValidationPhaseFile indicates when README files are being read from
// the file system.
validationPhaseFile validationPhase = "Filesystem reading"
// ValidationPhaseReadme indicates when a README's frontmatter is
// being parsed as YAML. This phase does not include YAML validation.
validationPhaseReadme validationPhase = "README parsing"
// ValidationPhaseCrossReference indicates when a README's frontmatter
// is having all its relative URLs be validated for whether they point to
// valid resources.
validationPhaseCrossReference validationPhase = "Cross-referencing relative asset URLs"
// --- end of validationPhases ---.
)
var (
supportedAvatarFileFormats = []string{".png", ".jpeg", ".jpg", ".gif", ".svg"}
// Matches markdown headers placed at the beginning of a line (e.g., "# " or "### "). To make the logic for
// validateReadmeBody easier, this pattern deliberately matches on invalid headers (header levels must be in the
// range 16 to be valid). The function has checks to see if the level is correct.
readmeHeaderRe = regexp.MustCompile(`^(#+)(\s*)`)
)
// readme represents a single README file within the repo (usually within the top-level "/registry" directory).
type readme struct {
filePath string
rawText string
}
// separateFrontmatter attempts to separate a README file's frontmatter content from the main README body, returning
// both values in that order. It does not validate whether the structure of the frontmatter is valid (i.e., that it's
// structured as YAML).
func separateFrontmatter(readmeText string) (readmeFrontmatter string, readmeBody string, err error) {
if readmeText == "" {
return "", "", xerrors.New("README is empty")
}
const fence = "---"
var fm strings.Builder
var body strings.Builder
fenceCount := 0
lineScanner := bufio.NewScanner(strings.NewReader(strings.TrimSpace(readmeText)))
for lineScanner.Scan() {
nextLine := lineScanner.Text()
if fenceCount < 2 && nextLine == fence {
fenceCount++
continue
}
// Break early if the very first line wasn't a fence, because then we know for certain that the README has problems.
if fenceCount == 0 {
break
}
// It should be safe to trim each line of the frontmatter on a per-line basis, because there shouldn't be any
// extra meaning attached to the indentation. The same does NOT apply to the README; best we can do is gather
// all the lines and then trim around it.
if inReadmeBody := fenceCount >= 2; inReadmeBody {
fmt.Fprintf(&body, "%s\n", nextLine)
} else {
fmt.Fprintf(&fm, "%s\n", strings.TrimSpace(nextLine))
}
}
if fenceCount < 2 {
return "", "", xerrors.New("README does not have two sets of frontmatter fences")
}
if fm.Len() == 0 {
return "", "", xerrors.New("readme has frontmatter fences but no frontmatter content")
}
return fm.String(), strings.TrimSpace(body.String()), nil
}
// TODO: This seems to work okay for now, but the really proper way of doing this is by parsing this as an AST, and then
// checking the resulting nodes.
func validateReadmeBody(body string) []error {
trimmed := strings.TrimSpace(body)
if trimmed == "" {
return []error{xerrors.New("README body is empty")}
}
// If the very first line of the README doesn't start with an ATX-style H1 header, there's a risk that the rest of the
// validation logic will break, since we don't have many guarantees about how the README is actually structured.
if !strings.HasPrefix(trimmed, "# ") {
return []error{xerrors.New("README body must start with ATX-style h1 header (i.e., \"# \")")}
}
var errs []error
latestHeaderLevel := 0
foundFirstH1 := false
isInCodeBlock := false
lineScanner := bufio.NewScanner(strings.NewReader(trimmed))
for lineScanner.Scan() {
nextLine := lineScanner.Text()
// Have to check this because a lot of programming languages support # comments (including Terraform), and
// without any context, there's no way to tell the difference between a markdown header and code comment.
if strings.HasPrefix(nextLine, "```") {
isInCodeBlock = !isInCodeBlock
continue
}
if isInCodeBlock {
continue
}
headerGroups := readmeHeaderRe.FindStringSubmatch(nextLine)
if headerGroups == nil {
continue
}
// In the Markdown spec it is mandatory to have a space following the header # symbol(s).
if headerGroups[2] == "" {
errs = append(errs, xerrors.New("header does not have space between header characters and main header text"))
}
nextHeaderLevel := len(headerGroups[1])
if nextHeaderLevel == 1 && !foundFirstH1 {
foundFirstH1 = true
latestHeaderLevel = 1
continue
}
// If we have obviously invalid headers, it's not really safe to keep proceeding with the rest of the content.
if nextHeaderLevel == 1 {
errs = append(errs, xerrors.New("READMEs cannot contain more than h1 header"))
break
}
if nextHeaderLevel > 6 {
errs = append(errs, xerrors.Errorf("README/HTML files cannot have headers exceed level 6 (found level %d)", nextHeaderLevel))
break
}
// This is something we need to enforce for accessibility, not just for the Registry website, but also when
// users are viewing the README files in the GitHub web view.
if nextHeaderLevel > latestHeaderLevel && nextHeaderLevel != (latestHeaderLevel+1) {
errs = append(errs, xerrors.New("headers are not allowed to increase more than 1 level at a time"))
continue
}
// As long as the above condition passes, there's no problems with going up a header level or going down 1+ header levels.
latestHeaderLevel = nextHeaderLevel
}
return errs
}
func validateFrontmatterYamlKeys(frontmatter string, allowedKeys []string) []error {
if len(allowedKeys) == 0 {
return []error{xerrors.New("Set of allowed keys is empty")}
}
var key string
var cutOk bool
var line string
var errs []error
lineScanner := bufio.NewScanner(strings.NewReader(frontmatter))
for lineScanner.Scan() {
line = lineScanner.Text()
key, _, cutOk = strings.Cut(line, ":")
if !cutOk || slices.Contains(allowedKeys, key) {
continue
}
errs = append(errs, xerrors.Errorf("detected unknown key %q", key))
}
return errs
}