~bigbes/confluence-md-utilities (v0.1.0): cmd/mdcx/verify.go

package main

import (
	"fmt"
	"io"
	"os"
	"regexp"
	"strings"

	"github.com/spf13/cobra"

	"go.bigb.es/confluence-md-utilities/converter"
	"go.bigb.es/confluence-md-utilities/format"
)

// verifyIndent is the indentation string the verify command hands to
// format.PrettyXML. It is bound to the --indent flag in init.
var verifyIndent string

// verifyCmd is the "verify" subcommand. It reads Confluence XML from the file
// argument (or stdin when none is given), normalizes it, and compares its
// pretty-printed form against the pretty-printed result of converting it to
// Markdown and back. On a match it prints an OK line to stderr; on a mismatch
// it prints a coloured diff and terminates the process with exit status 1.
var verifyCmd = &cobra.Command{
	Use:   "verify [input.xml]",
	Short: "Verify round-trip fidelity of XML → Markdown → XML conversion",
	Long: `Check that Confluence XML survives a round-trip through Markdown and back.

Compares:
  A = fmt(input XML)
  B = fmt(xml2md(input XML) → md2xml → XML)

If A and B match, the round-trip is lossless. Otherwise, prints a diff.

Reads from stdin if no file is specified.`,
	Args: cobra.MaximumNArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		// Number of unchanged lines shown around each change in the diff.
		const contextLines = 3

		// Input comes from the named file when one is given, stdin otherwise.
		var (
			raw []byte
			err error
		)
		switch len(args) {
		case 0:
			raw, err = io.ReadAll(os.Stdin)
		default:
			raw, err = os.ReadFile(args[0])
		}
		if err != nil {
			return fmt.Errorf("reading input: %w", err)
		}

		// Strip the constructs normalizeForVerify knows cannot survive the
		// round-trip, so that both sides start from the same text.
		source := normalizeForVerify(string(raw))

		// Side A: the (normalized) input, pretty-printed.
		original := format.PrettyXML(source, verifyIndent)

		// Side B: XML → Markdown → XML, pretty-printed the same way.
		markdown, err := converter.ConfluenceToMarkdown(source)
		if err != nil {
			return fmt.Errorf("xml→markdown: %w", err)
		}
		rebuilt, err := converter.MarkdownToConfluence([]byte(markdown))
		if err != nil {
			return fmt.Errorf("markdown→xml: %w", err)
		}
		roundTrip := format.PrettyXML(rebuilt, verifyIndent)

		if original == roundTrip {
			fmt.Fprintln(os.Stderr, "OK: round-trip is lossless")
			return nil
		}

		// Mismatch: status text and the ---/+++ file header go to stderr.
		// NOTE(review): printHunk writes the hunks themselves to stdout, so
		// redirecting only one stream splits the diff — confirm this is intended.
		report := os.Stderr
		fmt.Fprintln(report, "MISMATCH: round-trip produced different output")
		fmt.Fprintln(report, "")
		fmt.Fprintf(report, "%s--- original (formatted)%s\n", ansiRed, ansiReset)
		fmt.Fprintf(report, "%s+++ round-trip (formatted)%s\n", ansiGreen, ansiReset)

		ops := computeDiffOps(strings.Split(original, "\n"), strings.Split(roundTrip, "\n"))
		for _, h := range buildHunks(ops, contextLines) {
			printHunk(h)
		}

		// Exit directly so the process reports status 1 for a mismatch.
		// The return below is never reached; it only satisfies the compiler.
		os.Exit(1)
		return nil
	},
}

// init declares the verify command's --indent flag (default: two spaces,
// stored in verifyIndent) and attaches the command to rootCmd.
func init() {
	flags := verifyCmd.Flags()
	flags.StringVar(&verifyIndent, "indent", "  ", "Indentation string (default: 2 spaces)")

	rootCmd.AddCommand(verifyCmd)
}

// Patterns used by normalizeForVerify. All of them operate on raw XML text.
var (
	// reEmptyParagraph matches a paragraph whose only content is a line
	// break (<br>, <br/> or <br />), optionally surrounded by whitespace.
	reEmptyParagraph = regexp.MustCompile(`<p>\s*<br\s*/?>\s*</p>`)

	// reSpanInCode matches the first <span ...>text</span> that follows an
	// attribute-less <code> tag with only plain text in between.
	// Group 1 is "<code>" plus that leading text; group 2 is the span's text.
	reSpanInCode = regexp.MustCompile(`(<code>[^<]*)<span[^>]*>([^<]*)</span>`)

	// reAdjacentCode matches a closing </code> immediately followed by an
	// opening <code>, with nothing in between.
	reAdjacentCode = regexp.MustCompile(`</code><code>`)
)

// normalizeForVerify rewrites XML patterns that cannot survive a round-trip
// through Markdown, so that verify only compares what the converter is able
// to preserve. In order, it:
//
//  1. deletes paragraphs that contain nothing but a <br>,
//  2. unwraps <span> elements that follow a <code> tag, keeping their text,
//  3. joins directly adjacent <code> elements by deleting "</code><code>".
func normalizeForVerify(xml string) string {
	out := reEmptyParagraph.ReplaceAllString(xml, "")

	// One pass unwraps only the first span after each <code> (the pattern is
	// anchored on the opening tag), so iterate until a pass changes nothing.
	// Every replacement is shorter than its match, so this terminates.
	for {
		unwrapped := reSpanInCode.ReplaceAllString(out, "${1}${2}")
		if unwrapped == out {
			break
		}
		out = unwrapped
	}

	return reAdjacentCode.ReplaceAllString(out, "")
}

// ANSI escape sequences used to colour the diff output.
const (
	// ansiReset turns off every attribute set by the sequences below.
	ansiReset = "\033[0m"

	// Foreground colours.
	ansiRed   = "\033[31m"
	ansiGreen = "\033[32m"
	ansiCyan  = "\033[36m"

	// ansiBold switches on bold text.
	ansiBold = "\033[1m"

	// Background/foreground pairs used for inline highlights.
	ansiRedBg = "\033[41;37m" // red background, white text
	ansiGrnBg = "\033[42;30m" // green background, black text
)

// diffOp identifies the role a line plays in a line-level diff between two
// texts, A (the original) and B (the result). The zero value is opEqual.
type diffOp int

// The three operation kinds. opEqual marks a line present in both A and B.
const (
	opEqual  diffOp = iota
	opRemove        // line only in A
	opAdd           // line only in B
)

// diffLine is a single line of a computed diff: the operation, the line's
// text, and the line's position in each input. computeDiffOps sets lineA for
// equal and removed lines and lineB for equal and added lines; the side that
// does not contain the line is set to -1.
type diffLine struct {
	op    diffOp
	text  string
	lineA int // 1-based line number in A (-1 if not applicable)
	lineB int // 1-based line number in B (-1 if not applicable)
}

// hunk is one block of a unified diff: a run of changed lines together with
// the surrounding context lines. The start/count pairs are the numbers that
// printHunk shows in the "@@ -startA,countA +startB,countB @@" header; lines
// holds the hunk's operations in output order.
type hunk struct {
	startA, countA int // 1-based start and count for A
	startB, countB int // 1-based start and count for B
	lines          []diffLine
}

// computeDiffOps diffs two line slices and returns the operations that turn
// a into b, in file order. It fills a longest-common-subsequence table and
// walks it back from the bottom-right corner, so time and memory are both
// proportional to len(a)*len(b). Within one changed region the removed lines
// are emitted before the added ones. The result is nil when both inputs are
// empty.
func computeDiffOps(a, b []string) []diffLine {
	rows, cols := len(a), len(b)

	// lcs[r][c] is the LCS length of a[:r] and b[:c]; row 0 and column 0
	// stay zero.
	lcs := make([][]int, rows+1)
	for r := range lcs {
		lcs[r] = make([]int, cols+1)
	}
	for r, lineA := range a {
		for c, lineB := range b {
			if lineA == lineB {
				lcs[r+1][c+1] = lcs[r][c] + 1
				continue
			}
			best := lcs[r+1][c]
			if lcs[r][c+1] >= best {
				best = lcs[r][c+1]
			}
			lcs[r+1][c+1] = best
		}
	}

	// Walk back from the end of both inputs, collecting operations in
	// reverse order.
	var ops []diffLine
	for i, j := rows, cols; i > 0 || j > 0; {
		switch {
		case i > 0 && j > 0 && a[i-1] == b[j-1]:
			ops = append(ops, diffLine{op: opEqual, text: a[i-1], lineA: i, lineB: j})
			i, j = i-1, j-1
		case j > 0 && (i == 0 || lcs[i][j-1] >= lcs[i-1][j]):
			// Taking the addition first on a tie puts removals ahead of
			// additions once the list is reversed.
			ops = append(ops, diffLine{op: opAdd, text: b[j-1], lineA: -1, lineB: j})
			j--
		default:
			ops = append(ops, diffLine{op: opRemove, text: a[i-1], lineA: i, lineB: -1})
			i--
		}
	}

	// Flip the list into file order.
	for head, tail := 0, len(ops)-1; head < tail; head, tail = head+1, tail-1 {
		ops[head], ops[tail] = ops[tail], ops[head]
	}
	return ops
}

// buildHunks groups diff operations into unified-diff hunks.
//
// Every run of changed (non-equal) operations is widened by up to ctx
// operations on each side; runs whose widened ranges would touch or overlap
// are merged into one hunk. A negative ctx is treated as 0. The returned
// hunks alias sub-slices of ops; the result is nil when ops contains no
// change.
//
// For each hunk, countA/countB are the number of lines it takes from A/B and
// startA/startB are the line numbers of the first such line. Both sides are
// resolved independently, so a hunk that begins with a removal still reports
// the correct B start (and vice versa). When a side contributes no lines to
// the hunk (count 0), its start is the number of the last line before the
// hunk on that side, or 0 if there is none — the POSIX unified-diff
// convention for empty ranges.
func buildHunks(ops []diffLine, ctx int) []hunk {
	if ctx < 0 {
		// A negative context would make lo > hi below and panic on slicing.
		ctx = 0
	}

	// Collect half-open index ranges of changed operations, merging ranges
	// whose context windows would touch or overlap.
	type span struct{ start, end int }
	var changed []span
	for i, op := range ops {
		if op.op == opEqual {
			continue
		}
		if n := len(changed); n > 0 && i-changed[n-1].end <= 2*ctx {
			changed[n-1].end = i + 1
			continue
		}
		changed = append(changed, span{i, i + 1})
	}

	var hunks []hunk
	for _, ch := range changed {
		lo := ch.start - ctx
		if lo < 0 {
			lo = 0
		}
		hi := ch.end + ctx
		if hi > len(ops) {
			hi = len(ops)
		}

		h := hunk{lines: ops[lo:hi]}

		// Count the lines on each side and remember the first line number
		// seen on each side. The two sides are tracked separately: stopping
		// at whichever shows up first would leave the other one unset.
		var haveA, haveB bool
		for _, dl := range h.lines {
			if dl.op == opEqual || dl.op == opRemove {
				h.countA++
			}
			if dl.op == opEqual || dl.op == opAdd {
				h.countB++
			}
			if !haveA && dl.lineA > 0 {
				h.startA, haveA = dl.lineA, true
			}
			if !haveB && dl.lineB > 0 {
				h.startB, haveB = dl.lineB, true
			}
		}

		// A side with no lines in the hunk starts at the last line that
		// precedes the hunk on that side; the zero value (0) remains when
		// the hunk sits at the very beginning.
		for k := lo - 1; k >= 0 && !(haveA && haveB); k-- {
			if !haveA && ops[k].lineA > 0 {
				h.startA, haveA = ops[k].lineA, true
			}
			if !haveB && ops[k].lineB > 0 {
				h.startB, haveB = ops[k].lineB, true
			}
		}

		hunks = append(hunks, h)
	}
	return hunks
}

// printHunk writes one hunk to stdout in unified-diff form with ANSI colours:
// a cyan "@@" header, context lines prefixed with a space, removed lines in
// red prefixed with "-", and added lines in green prefixed with "+".
//
// A run of removals immediately followed by a run of additions is treated as
// a replacement: the k-th removed line is paired with the k-th added line,
// each pair is printed as "-" then "+", and inlineHighlight marks the part of
// the two lines that differs. Lines left over once the shorter run is used up
// are printed plainly, removals first.
func printHunk(h hunk) {
	out := os.Stdout
	fmt.Fprintf(out, "%s@@ -%d,%d +%d,%d @@%s\n",
		ansiCyan, h.startA, h.countA, h.startB, h.countB, ansiReset)

	body := h.lines
	for idx := 0; idx < len(body); {
		switch cur := body[idx]; cur.op {
		case opEqual:
			fmt.Fprintf(out, " %s\n", cur.text)
			idx++

		case opAdd:
			// An addition that does not directly follow a removal.
			fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, cur.text, ansiReset)
			idx++

		case opRemove:
			// Find the end of this run of removals and of the run of
			// additions that directly follows it (possibly empty).
			remEnd := idx
			for remEnd < len(body) && body[remEnd].op == opRemove {
				remEnd++
			}
			addEnd := remEnd
			for addEnd < len(body) && body[addEnd].op == opAdd {
				addEnd++
			}
			dels, ins := body[idx:remEnd], body[remEnd:addEnd]

			paired := min(len(dels), len(ins))
			for k := 0; k < paired; k++ {
				oldText, newText := inlineHighlight(dels[k].text, ins[k].text)
				fmt.Fprintf(out, "%s-%s%s\n", ansiRed, oldText, ansiReset)
				fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, newText, ansiReset)
			}
			for _, d := range dels[paired:] {
				fmt.Fprintf(out, "%s-%s%s\n", ansiRed, d.text, ansiReset)
			}
			for _, a := range ins[paired:] {
				fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, a.text, ansiReset)
			}

			// Resume after everything consumed above.
			idx = addEnd

		default:
			// Unknown operation: print nothing and move on.
			idx++
		}
	}
}

// inlineHighlight marks where a removed line a and an added line b differ.
//
// It strips the longest common prefix and then the longest common suffix
// (compared rune by rune, never overlapping the prefix) and wraps what is
// left in the middle of each line in bold plus a coloured background: red
// for a, green for b. After the highlighted part the line's own foreground
// colour (red or green) is switched back on for the shared suffix. Identical
// lines are returned unchanged.
func inlineHighlight(a, b string) (string, string) {
	left, right := []rune(a), []rune(b)

	shorter := len(left)
	if len(right) < shorter {
		shorter = len(right)
	}

	// Length of the shared prefix.
	head := 0
	for head < shorter && left[head] == right[head] {
		head++
	}
	// Length of the shared suffix, limited to what the prefix left over.
	tail := 0
	for tail < shorter-head && left[len(left)-1-tail] == right[len(right)-1-tail] {
		tail++
	}

	if head+tail == len(left) && head+tail == len(right) {
		// Nothing differs, so there is nothing to highlight.
		return a, b
	}

	lead := string(left[:head])
	trail := string(left[len(left)-tail:])

	// mark wraps the differing middle of line in bold+bg, then restores fg
	// for the shared suffix.
	mark := func(line []rune, bg, fg string) string {
		middle := string(line[head : len(line)-tail])
		return lead + ansiBold + bg + middle + ansiReset + fg + trail
	}

	return mark(left, ansiRedBg, ansiRed), mark(right, ansiGrnBg, ansiGreen)
}