~bigbes/confluence-md-utilities

ref: v0.1.0 confluence-md-utilities/cmd/mdcx/verify.go -rw-r--r-- 8.9 KiB
e0e81bc6 — Eugene Blikh chore: rename module to go.bigb.es/confluence-md-utilities a month ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
package main

import (
	"fmt"
	"io"
	"os"
	"regexp"
	"strings"

	"github.com/spf13/cobra"

	"go.bigb.es/confluence-md-utilities/converter"
	"go.bigb.es/confluence-md-utilities/format"
)

var (
	// verifyIndent is the indentation string passed to format.PrettyXML when
	// pretty-printing both sides of the comparison. It is bound to the
	// --indent flag of the verify command.
	verifyIndent string
)

// verifyCmd implements the "verify" subcommand. It pretty-prints the
// (normalized) input XML, converts the same input to Markdown and back to XML,
// pretty-prints that result too, and compares the two strings. A match is
// reported on stderr; a mismatch prints a colored unified diff and terminates
// the process with exit status 1.
var verifyCmd = &cobra.Command{
	Use:   "verify [input.xml]",
	Short: "Verify round-trip fidelity of XML → Markdown → XML conversion",
	Long: `Check that Confluence XML survives a round-trip through Markdown and back.

Compares:
  A = fmt(input XML)
  B = fmt(xml2md(input XML) → md2xml → XML)

If A and B match, the round-trip is lossless. Otherwise, prints a diff.

Reads from stdin if no file is specified.`,
	Args: cobra.MaximumNArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		// Load the document: the positional file when given, stdin otherwise.
		var (
			raw     []byte
			readErr error
		)
		switch len(args) {
		case 0:
			raw, readErr = io.ReadAll(os.Stdin)
		default:
			raw, readErr = os.ReadFile(args[0])
		}
		if readErr != nil {
			return fmt.Errorf("reading input: %w", readErr)
		}

		// Strip constructs that cannot survive the round-trip so they are not
		// reported as differences.
		source := normalizeForVerify(string(raw))

		// Side A: the normalized original, pretty-printed.
		want := format.PrettyXML(source, verifyIndent)

		// Side B: XML → Markdown → XML, pretty-printed the same way.
		markdown, err := converter.ConfluenceToMarkdown(source)
		if err != nil {
			return fmt.Errorf("xml→markdown: %w", err)
		}
		roundTrip, err := converter.MarkdownToConfluence([]byte(markdown))
		if err != nil {
			return fmt.Errorf("markdown→xml: %w", err)
		}
		got := format.PrettyXML(roundTrip, verifyIndent)

		if want != got {
			// NOTE(review): the status line and the ---/+++ headers go to
			// stderr, while printHunk writes the hunks to stdout; redirecting
			// only one stream therefore yields a partial diff. Confirm whether
			// this split is intentional.
			fmt.Fprintln(os.Stderr, "MISMATCH: round-trip produced different output")
			fmt.Fprintln(os.Stderr, "")
			fmt.Fprintf(os.Stderr, "%s--- original (formatted)%s\n", ansiRed, ansiReset)
			fmt.Fprintf(os.Stderr, "%s+++ round-trip (formatted)%s\n", ansiGreen, ansiReset)

			// Line-level diff with three lines of context around each change.
			ops := computeDiffOps(strings.Split(want, "\n"), strings.Split(got, "\n"))
			for _, h := range buildHunks(ops, 3) {
				printHunk(h)
			}

			// NOTE(review): os.Exit ends the process immediately (deferred
			// calls do not run). Presumably chosen over returning an error to
			// avoid cobra's own error output — confirm.
			os.Exit(1)
		}

		fmt.Fprintln(os.Stderr, "OK: round-trip is lossless")
		return nil
	},
}

// init registers the --indent flag (stored in verifyIndent, two spaces by
// default) on verifyCmd and attaches verifyCmd to rootCmd.
func init() {
	flags := verifyCmd.Flags()
	flags.StringVar(&verifyIndent, "indent", "  ", "Indentation string (default: 2 spaces)")

	rootCmd.AddCommand(verifyCmd)
}

// Patterns used by normalizeForVerify.
var (
	// reEmptyParagraph matches a paragraph whose only content is a line break,
	// e.g. <p><br /></p> or <p> <br/> </p>.
	reEmptyParagraph = regexp.MustCompile(`<p>\s*<br\s*/?>\s*</p>`)
	// reSpanInCode matches the first <span ...>text</span> that follows a
	// <code> tag and its leading text. Group 1 is "<code>" plus that leading
	// text, group 2 is the span's text content.
	reSpanInCode = regexp.MustCompile(`(<code>[^<]*)<span[^>]*>([^<]*)</span>`)
	// reAdjacentCode matches the seam between two directly adjacent <code>
	// elements.
	reAdjacentCode = regexp.MustCompile(`</code><code>`)
)

// normalizeForVerify removes XML constructs that cannot survive a round-trip
// through Markdown, so that verify only compares what the converter is able to
// preserve. In order, it:
//
//  1. deletes paragraphs that contain nothing but a <br>;
//  2. unwraps <span> elements that follow the text at the start of a <code>
//     element, keeping only the span's text;
//  3. joins directly adjacent <code> elements by dropping the "</code><code>"
//     seam between them.
func normalizeForVerify(xml string) string {
	out := reEmptyParagraph.ReplaceAllString(xml, "")

	// One pass unwraps only the first span after each <code> tag, so keep
	// going until a pass changes nothing. Every successful replacement removes
	// the span's tags, so the string shrinks and the loop terminates.
	for {
		unwrapped := reSpanInCode.ReplaceAllString(out, "${1}${2}")
		if unwrapped == out {
			break
		}
		out = unwrapped
	}

	return reAdjacentCode.ReplaceAllString(out, "")
}

// ANSI escape codes for diff output.
//
// The plain foreground codes color whole diff lines (red for removed, green
// for added, cyan for the hunk header). The bold and background codes are
// combined by inlineHighlight to mark the part of a changed line that actually
// differs. ansiReset clears all attributes.
const (
	ansiReset  = "\033[0m"
	ansiRed    = "\033[31m"
	ansiGreen  = "\033[32m"
	ansiCyan   = "\033[36m"
	ansiBold   = "\033[1m"
	ansiRedBg  = "\033[41;37m" // red background, white text
	ansiGrnBg  = "\033[42;30m" // green background, black text
)

// diffOp represents a line-level diff operation.
type diffOp int

// Diff operations. In the verify command, A is the formatted original XML and
// B is the formatted round-trip result. opEqual (the zero value) marks a line
// that is present in both.
const (
	opEqual  diffOp = iota
	opRemove        // line only in A
	opAdd           // line only in B
)

// diffLine is a single line in the diff with its operation and source positions.
//
// As produced by computeDiffOps, text is taken from A for equal and removed
// lines and from B for added lines; the line number of the side a line does not
// belong to is set to -1.
type diffLine struct {
	op    diffOp
	text  string
	lineA int // 1-based line number in A (-1 if not applicable)
	lineB int // 1-based line number in B (-1 if not applicable)
}

// hunk is a group of diff lines with surrounding context.
//
// The start/count pairs are the values printed in the "@@ -a,b +c,d @@" header
// by printHunk. countA counts the equal and removed lines of the hunk, countB
// the equal and added lines. lines is a sub-slice of the full operation list
// built by buildHunks (it shares that list's backing array).
type hunk struct {
	startA, countA int // 1-based start and count for A
	startB, countB int // 1-based start and count for B
	lines          []diffLine
}

// computeDiffOps returns the line-level edit script that turns a into b,
// derived from a longest-common-subsequence (LCS) table.
//
// Each returned diffLine is one of:
//   - opEqual:  the line is in both inputs (lineA and lineB both set);
//   - opRemove: the line is only in a (lineB is -1);
//   - opAdd:    the line is only in b (lineA is -1).
//
// Line numbers are 1-based. Where a run of lines was replaced, the removals
// are emitted before the additions. The table takes (len(a)+1)*(len(b)+1)
// ints, so time and memory are proportional to the product of the input sizes.
func computeDiffOps(a, b []string) []diffLine {
	rows, cols := len(a), len(b)

	// lcs[r][c] is the LCS length of a[:r] and b[:c]; row 0 and column 0 stay
	// zero and act as the base case.
	lcs := make([][]int, rows+1)
	for r := range lcs {
		lcs[r] = make([]int, cols+1)
	}
	for r, lineA := range a {
		for c, lineB := range b {
			switch up, left := lcs[r][c+1], lcs[r+1][c]; {
			case lineA == lineB:
				lcs[r+1][c+1] = lcs[r][c] + 1
			case up >= left:
				lcs[r+1][c+1] = up
			default:
				lcs[r+1][c+1] = left
			}
		}
	}

	// Walk from the bottom-right corner back to the origin. The script comes
	// out last-operation-first and is reversed afterwards.
	var ops []diffLine
	r, c := rows, cols
	for r > 0 || c > 0 {
		switch {
		case r > 0 && c > 0 && a[r-1] == b[c-1]:
			ops = append(ops, diffLine{op: opEqual, text: a[r-1], lineA: r, lineB: c})
			r--
			c--
		case c > 0 && (r == 0 || lcs[r][c-1] >= lcs[r-1][c]):
			// On ties prefer stepping over b first; after the reversal this
			// places additions after the removals they replace.
			ops = append(ops, diffLine{op: opAdd, text: b[c-1], lineA: -1, lineB: c})
			c--
		default:
			ops = append(ops, diffLine{op: opRemove, text: a[r-1], lineA: r, lineB: -1})
			r--
		}
	}

	// Flip the script into forward order.
	for k := 0; k < len(ops)/2; k++ {
		mirror := len(ops) - 1 - k
		ops[k], ops[mirror] = ops[mirror], ops[k]
	}
	return ops
}

// buildHunks groups diff operations into unified-diff hunks.
//
// ops is the complete edit script (as returned by computeDiffOps). ctx is the
// number of unchanged context lines to keep before and after every change; a
// negative value is treated as 0. Changes separated by at most 2*ctx unchanged
// lines end up in the same hunk, so hunks never overlap.
//
// For each hunk, countA/countB are the number of lines it covers in A and B.
// startA/startB are the 1-based line numbers of the first line the hunk covers
// on that side. If a hunk covers no line on one side (for example a pure
// insertion with ctx == 0), the start for that side is the line preceding the
// hunk, or 0 when there is none — the convention used by the unified diff
// format for empty ranges.
//
// The lines of every hunk alias ops; no diff lines are copied. The result is
// nil when ops contains no changes.
func buildHunks(ops []diffLine, ctx int) []hunk {
	if ctx < 0 {
		// A negative context would make lo > hi below and panic on the slice
		// expression.
		ctx = 0
	}

	// Collect half-open index ranges [start, end) of changed operations,
	// merging a change into the previous range when their context would touch.
	type span struct{ start, end int } // indices into ops
	var changed []span
	for i, op := range ops {
		if op.op == opEqual {
			continue
		}
		if n := len(changed); n > 0 && i-changed[n-1].end <= 2*ctx {
			changed[n-1].end = i + 1
			continue
		}
		changed = append(changed, span{start: i, end: i + 1})
	}

	var hunks []hunk
	for _, ch := range changed {
		// Widen the range by the context, clamped to the bounds of ops.
		lo := ch.start - ctx
		if lo < 0 {
			lo = 0
		}
		hi := ch.end + ctx
		if hi > len(ops) {
			hi = len(ops)
		}

		h := hunk{lines: ops[lo:hi]}
		for _, dl := range h.lines {
			if dl.op == opEqual || dl.op == opRemove {
				h.countA++
			}
			if dl.op == opEqual || dl.op == opAdd {
				h.countB++
			}
		}
		// Resolve the two sides independently: the first line of a hunk may
		// carry a valid number for only one of them.
		h.startA = hunkStartLine(ops, lo, hi, func(dl diffLine) int { return dl.lineA })
		h.startB = hunkStartLine(ops, lo, hi, func(dl diffLine) int { return dl.lineB })

		hunks = append(hunks, h)
	}
	return hunks
}

// hunkStartLine returns the start line of the hunk ops[lo:hi] for the side
// selected by num: the first positive line number inside the hunk, otherwise
// the last positive line number before it, otherwise 0.
func hunkStartLine(ops []diffLine, lo, hi int, num func(diffLine) int) int {
	for _, dl := range ops[lo:hi] {
		if n := num(dl); n > 0 {
			return n
		}
	}
	for k := lo - 1; k >= 0; k-- {
		if n := num(ops[k]); n > 0 {
			return n
		}
	}
	return 0
}

// printHunk writes one unified-diff hunk to standard output, using ANSI colors.
//
// The "@@ -a,b +c,d @@" header is printed in cyan, unchanged lines with a
// leading space, removed lines in red with "-", and added lines in green with
// "+". When a run of removed lines is directly followed by a run of added
// lines, the two runs are paired up position by position: each pair is printed
// as a "-" line immediately followed by its "+" line, with the differing part
// marked by inlineHighlight. Lines left over in the longer run are printed
// afterwards without inline marking, removals first.
func printHunk(h hunk) {
	out := os.Stdout

	fmt.Fprintf(out, "%s@@ -%d,%d +%d,%d @@%s\n",
		ansiCyan, h.startA, h.countA, h.startB, h.countB, ansiReset)

	all := h.lines
	// pos is advanced explicitly by every case, because a removal consumes the
	// whole remove/add group at once.
	for pos := 0; pos < len(all); {
		switch cur := all[pos]; cur.op {
		case opEqual:
			fmt.Fprintf(out, " %s\n", cur.text)
			pos++

		case opAdd:
			// An addition that does not directly follow a removal.
			fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, cur.text, ansiReset)
			pos++

		case opRemove:
			// Find the end of this run of removals and of the run of
			// additions that immediately follows it (which may be empty).
			delEnd := pos
			for delEnd < len(all) && all[delEnd].op == opRemove {
				delEnd++
			}
			insEnd := delEnd
			for insEnd < len(all) && all[insEnd].op == opAdd {
				insEnd++
			}
			dels, ins := all[pos:delEnd], all[delEnd:insEnd]

			paired := min(len(dels), len(ins))
			for k := 0; k < paired; k++ {
				oldText, newText := inlineHighlight(dels[k].text, ins[k].text)
				fmt.Fprintf(out, "%s-%s%s\n", ansiRed, oldText, ansiReset)
				fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, newText, ansiReset)
			}
			// Whatever could not be paired is printed plainly.
			for _, d := range dels[paired:] {
				fmt.Fprintf(out, "%s-%s%s\n", ansiRed, d.text, ansiReset)
			}
			for _, a := range ins[paired:] {
				fmt.Fprintf(out, "%s+%s%s\n", ansiGreen, a.text, ansiReset)
			}
			pos = insEnd

		default:
			// Unknown operations are skipped silently.
			pos++
		}
	}
}

// inlineHighlight marks the part in which two lines differ.
//
// It strips the longest common prefix and then the longest common suffix
// (compared rune by rune, never overlapping the prefix) and wraps what is left
// in the middle of each line in bold plus a colored background: red for a, the
// removed line, and green for b, the added line. After the marked part the
// attributes are reset and the line's plain foreground color is switched back
// on for the shared suffix. If nothing is left in either middle the inputs are
// returned untouched.
//
// The first result corresponds to a, the second to b.
func inlineHighlight(a, b string) (string, string) {
	left, right := []rune(a), []rune(b)

	shorter := len(left)
	if len(right) < shorter {
		shorter = len(right)
	}

	// Length of the shared leading run.
	head := 0
	for head < shorter && left[head] == right[head] {
		head++
	}

	// Length of the shared trailing run, limited to what the prefix left over
	// in the shorter line.
	tail := 0
	for tail < shorter-head && left[len(left)-1-tail] == right[len(right)-1-tail] {
		tail++
	}

	endLeft, endRight := len(left)-tail, len(right)-tail
	if head == endLeft && head == endRight {
		// Nothing differs, so there is nothing to mark.
		return a, b
	}

	leading := string(left[:head])
	trailing := string(left[endLeft:])
	mark := func(background, foreground string, middle []rune) string {
		return leading + ansiBold + background + string(middle) + ansiReset + foreground + trailing
	}

	return mark(ansiRedBg, ansiRed, left[head:endLeft]),
		mark(ansiGrnBg, ansiGreen, right[head:endRight])
}