~bigbes/confluence-md-utilities (e0e81bc69b5622c2bd20261155fc36519a7621d8): format/pretty.go

package format

import (
	"strings"
	"unicode/utf8"
)

// defaultMaxLineWidth is the line-width limit, measured in runes. PrettyXML
// wraps output lines wider than this, and tryInlineBlock only keeps a block
// on a single line when the inlined text does not exceed it.
const defaultMaxLineWidth = 120

// blockTags lists the elements that are laid out as blocks: the tag is placed
// on its own line and, unless the whole element is inlined (see
// inlineableBlocks), its children are indented one level deeper.
var blockTags = map[string]bool{
	// Layout
	"ac:layout":         true,
	"ac:layout-section": true,
	"ac:layout-cell":    true,
	// Block content
	"p":  true,
	"h1": true, "h2": true, "h3": true, "h4": true, "h5": true, "h6": true,
	"div": true,
	// Lists
	"ul": true, "ol": true, "li": true,
	// Tables
	"table": true, "thead": true, "tbody": true, "colgroup": true,
	"tr": true, "th": true, "td": true,
	// Macros
	"ac:structured-macro": true,
	"ac:rich-text-body":   true,
	"ac:plain-text-body":  true,
	// Task lists
	"ac:task-list": true,
	"ac:task":      true,
	"ac:task-body": true,
	// Task metadata. Both are listed in inlineableBlocks, and that table is
	// only consulted for tags that are block tags, so they must appear here
	// for each of them to get its own (single) line inside <ac:task>.
	"ac:task-id":     true,
	"ac:task-status": true,
}

// inlineableBlocks names the block elements that are kept on a single line
// (open tag, content and close tag together) when the result is short enough.
// The table is only consulted for tags that are also present in blockTags.
var inlineableBlocks = map[string]bool{
	// List items and table cells
	"li": true,
	"th": true,
	"td": true,
	// Headings
	"h1": true,
	"h2": true,
	"h3": true,
	"h4": true,
	"h5": true,
	"h6": true,
	// Task metadata
	"ac:task-id":     true,
	"ac:task-status": true,
}

// preTags lists elements whose content is preformatted: while PrettyXML is
// inside one, every token's raw text is copied to the output as-is instead of
// being re-indented or having its whitespace collapsed.
var preTags = map[string]bool{
	"ac:plain-text-body": true,
}

// PrettyXML formats Confluence storage XML with sensible indentation.
//
// The input is tokenized leniently (it does not have to be well-formed) and
// re-emitted as follows:
//
//   - block elements (blockTags) get their own line and indent their children
//     by one more repetition of indent;
//   - short inlineable blocks (inlineableBlocks) stay on a single line;
//   - inline elements, text, CDATA and comments flow on the current line, with
//     runs of whitespace in text collapsed to a single space;
//   - everything inside a preformatted element (preTags) is copied verbatim.
//
// Afterwards trailing blanks are trimmed from every line, and lines wider than
// defaultMaxLineWidth are wrapped. Lines that carry preformatted content are
// never wrapped, because wrapping would re-flow the content (typically source
// code in a CDATA section). The result always ends with exactly one newline.
func PrettyXML(input string, indent string) string {
	tokens := tokenize(input)
	var buf strings.Builder
	level := 0
	inPre := 0
	atLineStart := true

	// verbatim records the (0-based) output lines that contain preformatted
	// content. currentLine counts newlines incrementally so that the total
	// bookkeeping cost stays linear in the size of the output.
	verbatim := make(map[int]bool)
	scanned, lineNo := 0, 0
	currentLine := func() int {
		written := buf.String()
		lineNo += strings.Count(written[scanned:], "\n")
		scanned = len(written)
		return lineNo
	}
	writeVerbatim := func(raw string) {
		first := currentLine()
		buf.WriteString(raw)
		last := currentLine()
		for n := first; n <= last; n++ {
			verbatim[n] = true
		}
	}
	// writeInline appends raw to the current line, indenting first when the
	// line is still empty.
	writeInline := func(raw string) {
		if atLineStart {
			writeIndentPrefix(&buf, level, indent)
			atLineStart = false
		}
		buf.WriteString(raw)
	}

	for i := 0; i < len(tokens); i++ {
		tok := tokens[i]

		// Inside a preformatted element nothing is interpreted except the
		// nesting of the pre tags themselves.
		if inPre > 0 {
			writeVerbatim(tok.raw)
			switch tok.kind {
			case tokenOpen:
				if preTags[tok.tagName()] {
					inPre++
				}
			case tokenClose:
				if preTags[tok.tagName()] {
					inPre--
				}
			}
			continue
		}

		switch tok.kind {
		case tokenOpen:
			tagName := tok.tagName()
			switch {
			case preTags[tagName]:
				// The pre element starts on its own line but does not open
				// an indentation level: its content is emitted untouched.
				inPre++
				ensureIndentedLine(&buf, level, indent, &atLineStart)
				writeVerbatim(tok.raw)
			case blockTags[tagName]:
				// Try to inline short blocks like <li>text</li>, <h1>Title</h1>.
				if inlineableBlocks[tagName] {
					if inlined, consumed := tryInlineBlock(tokens[i:], tagName); consumed > 0 {
						ensureIndentedLine(&buf, level, indent, &atLineStart)
						buf.WriteString(inlined)
						buf.WriteString("\n")
						atLineStart = true
						// The loop's own increment accounts for the last
						// consumed token.
						i += consumed - 1
						continue
					}
				}
				ensureIndentedLine(&buf, level, indent, &atLineStart)
				buf.WriteString(tok.raw)
				buf.WriteString("\n")
				level++
				atLineStart = true
			default:
				writeInline(tok.raw)
			}

		case tokenClose:
			if blockTags[tok.tagName()] {
				level--
				if level < 0 {
					// Unbalanced close tag: never indent negatively.
					level = 0
				}
				if !atLineStart {
					buf.WriteString("\n")
				}
				writeIndentPrefix(&buf, level, indent)
				buf.WriteString(tok.raw)
				buf.WriteString("\n")
				atLineStart = true
			} else {
				buf.WriteString(tok.raw)
			}

		case tokenSelfClose:
			tagName := tok.tagName()
			if blockTags[tagName] || tagName == "hr" || tagName == "col" {
				ensureIndentedLine(&buf, level, indent, &atLineStart)
				buf.WriteString(tok.raw)
				buf.WriteString("\n")
				atLineStart = true
			} else {
				writeInline(tok.raw)
			}

		case tokenText:
			text := collapseWS(tok.raw)
			if atLineStart {
				// Whitespace at the start of a line is layout, not content.
				text = strings.TrimLeft(text, " ")
			}
			if text == "" {
				continue
			}
			// In the middle of a line even a whitespace-only text is kept: it
			// may separate two inline elements ("</strong> <em>"). If a line
			// break follows instead, the trailing blank is trimmed below.
			writeInline(text)

		case tokenCDATA, tokenComment:
			writeInline(tok.raw)
		}
	}

	// Post-process: trim trailing blanks and wrap long lines, leaving lines
	// with preformatted content unwrapped.
	rawLines := strings.Split(buf.String(), "\n")
	final := make([]string, 0, len(rawLines))
	for n, line := range rawLines {
		line = strings.TrimRight(line, " \t")
		if !verbatim[n] && runeWidth(line) > defaultMaxLineWidth {
			final = append(final, wrapLine(line, defaultMaxLineWidth)...)
		} else {
			final = append(final, line)
		}
	}
	return strings.TrimSpace(strings.Join(final, "\n")) + "\n"
}

// tryInlineBlock reports whether the block that starts at tokens[0] (the open
// tag named tagName) can be written on a single line.
//
// That is the case when the matching close tag is found, no child is a block
// element other than an inlineable one, no CDATA section or comment inside
// spans several lines, and the assembled text is at most defaultMaxLineWidth
// runes wide (indentation not included). Text is whitespace-collapsed, and
// whitespace directly after the open tag or directly before the close tag is
// dropped.
//
// On success it returns the single-line text and the number of tokens it
// covers, including the open and close tags. Otherwise it returns "" and 0.
func tryInlineBlock(tokens []token, tagName string) (string, int) {
	if len(tokens) < 2 {
		return "", 0
	}

	var inner strings.Builder
	inner.WriteString(tokens[0].raw)
	depth := 1 // nesting of elements named tagName; tokens[0] opened the first

	for j := 1; j < len(tokens); j++ {
		tok := tokens[j]
		switch tok.kind {
		case tokenOpen:
			tn := tok.tagName()
			if blockTags[tn] && !inlineableBlocks[tn] {
				// Contains a non-inlineable block child: can't inline.
				return "", 0
			}
			if tn == tagName {
				depth++
			}
			inner.WriteString(tok.raw)

		case tokenClose:
			if tok.tagName() == tagName {
				depth--
				if depth == 0 {
					// Only text can leave trailing spaces here; tags, CDATA
					// and comments end in '>'.
					result := strings.TrimRight(inner.String(), " ") + tok.raw
					if runeWidth(result) <= defaultMaxLineWidth {
						return result, j + 1
					}
					return "", 0
				}
			}
			inner.WriteString(tok.raw)

		case tokenText:
			text := collapseWS(tok.raw)
			// Trim leading space only for the text right after the open tag;
			// anywhere else it separates inline content.
			if j == 1 {
				text = strings.TrimLeft(text, " ")
			}
			inner.WriteString(text)

		case tokenCDATA, tokenComment:
			// Copied verbatim, so a line break inside would make the result
			// span several lines.
			if strings.Contains(tok.raw, "\n") {
				return "", 0
			}
			inner.WriteString(tok.raw)

		default:
			inner.WriteString(tok.raw)
		}
	}

	// Ran out of tokens without finding the matching close tag.
	return "", 0
}

// wrapLine splits a long line into lines of at most maxWidth runes where
// possible, preserving the leading indentation; continuation lines are
// indented two extra spaces.
//
// The line is cut into tags and text with splitSegments. A tag is never broken
// internally. Text is broken only at XML whitespace (space, tab, CR, LF), so
// other characters such as a non-breaking space stay inside their word. A
// break may be placed before any tag or word; when it replaces whitespace,
// that whitespace is dropped. Whitespace that is not turned into a break is
// kept as a single space, including whitespace at the end of a text segment
// and whitespace-only text between two tags. A piece wider than maxWidth is
// emitted on a line of its own and exceeds the limit.
//
// If nothing but indentation is found, the original line is returned as the
// only element.
func wrapLine(line string, maxWidth int) []string {
	body := strings.TrimLeft(line, " \t")
	indentStr := line[:len(line)-len(body)]
	contIndent := indentStr + "  " // continuation lines get extra indent

	isSpace := func(r rune) bool {
		return r == ' ' || r == '\t' || r == '\r' || r == '\n'
	}

	var (
		lines        []string
		cur          strings.Builder
		hasContent   bool // current line holds more than its indentation
		pendingSpace bool // whitespace was seen since the last emitted piece
	)
	cur.WriteString(indentStr)
	curWidth := runeWidth(indentStr)

	// emit appends one unbreakable piece (a tag or a word), starting a new
	// line first when the piece does not fit on a line that already has
	// content. A pending space becomes either a single blank or the break.
	emit := func(piece string) {
		pieceW := runeWidth(piece)
		sepW := 0
		if pendingSpace && hasContent {
			sepW = 1
		}
		if hasContent && curWidth+sepW+pieceW > maxWidth {
			lines = append(lines, strings.TrimRight(cur.String(), " "))
			cur.Reset()
			cur.WriteString(contIndent)
			curWidth = runeWidth(contIndent)
			sepW = 0
		}
		if sepW == 1 {
			cur.WriteByte(' ')
			curWidth++
		}
		cur.WriteString(piece)
		curWidth += pieceW
		hasContent = true
		pendingSpace = false
	}

	for _, seg := range splitSegments(body) {
		if seg == "" {
			continue
		}
		if seg[0] == '<' {
			emit(seg)
			continue
		}

		// Text segment: remember whitespace at either edge so it is not lost
		// when the neighbouring segment is a tag.
		if isSpace(rune(seg[0])) {
			pendingSpace = true
		}
		for k, word := range strings.FieldsFunc(seg, isSpace) {
			if k > 0 {
				pendingSpace = true
			}
			emit(word)
		}
		if isSpace(rune(seg[len(seg)-1])) {
			pendingSpace = true
		}
	}

	if last := strings.TrimRight(cur.String(), " "); last != "" {
		lines = append(lines, last)
	}
	if len(lines) == 0 {
		return []string{line}
	}
	return lines
}

// splitSegments breaks s into consecutive tag and text segments that
// concatenate back to s.
// E.g. "Hello <strong>world</strong> end" -> ["Hello ", "<strong>", "world", "</strong>", " end"]
//
// A tag segment runs from a '<' to the first '>' after it, so a '>' inside a
// comment, CDATA section or attribute value ends the segment early. A '<'
// without any following '>' yields the rest of s as one final segment.
func splitSegments(s string) []string {
	var segs []string
	for len(s) > 0 {
		lt := strings.IndexByte(s, '<')
		if lt == -1 {
			segs = append(segs, s)
			break
		}
		if lt > 0 {
			segs = append(segs, s[:lt])
		}
		gt := strings.IndexByte(s[lt:], '>')
		if gt == -1 {
			segs = append(segs, s[lt:])
			break
		}
		end := lt + gt + 1
		segs = append(segs, s[lt:end])
		s = s[end:]
	}
	return segs
}

// runeWidth returns the number of runes in s, which is used as its display
// width.
func runeWidth(s string) int {
	return utf8.RuneCountInString(s)
}

// ensureIndentedLine positions the output at the start of a fresh, indented
// line: if the current line already has content (*atLineStart is false) a
// newline is written first, then the indentation for level is emitted.
// *atLineStart is left false because the indentation has been written and the
// caller is expected to append content right away.
func ensureIndentedLine(buf *strings.Builder, level int, indent string, atLineStart *bool) {
	midLine := !*atLineStart
	*atLineStart = false
	if midLine {
		buf.WriteByte('\n')
	}
	writeIndentPrefix(buf, level, indent)
}

// writeIndentPrefix appends level copies of indent to buf. A level of zero or
// less writes nothing.
func writeIndentPrefix(buf *strings.Builder, level int, indent string) {
	if level <= 0 {
		return
	}
	buf.WriteString(strings.Repeat(indent, level))
}

// collapseWS replaces every run of spaces, tabs, line feeds and carriage
// returns in s with a single space. Runs at the start or end of s are
// collapsed as well, not removed; all other characters are copied unchanged.
func collapseWS(s string) string {
	var out strings.Builder
	prevWasSpace := false
	for _, r := range s {
		switch r {
		case ' ', '\t', '\n', '\r':
			if prevWasSpace {
				continue
			}
			out.WriteByte(' ')
			prevWasSpace = true
		default:
			out.WriteRune(r)
			prevWasSpace = false
		}
	}
	return out.String()
}

// tokenKind classifies the tokens produced by the XML tokenizer.
type tokenKind int

const (
	tokenOpen      tokenKind = iota // <tag ...>
	tokenClose                      // </tag>
	tokenSelfClose                  // <tag .../>
	tokenText                       // plain text
	tokenCDATA                      // <![CDATA[...]]>
	tokenComment                    // <!-- ... -->
)

// token is one lexical piece of the input. raw is the exact source text of
// the token, delimiters included; concatenating the raw text of all tokens
// returned by tokenize reproduces the input.
type token struct {
	kind tokenKind
	raw  string
}

// tagName returns the lower-cased element name of an open, close or
// self-closing tag token, and "" for every other kind of token. For open and
// self-closing tags the name ends at the first XML whitespace character
// (space, tab, CR or LF), i.e. before any attributes.
func (t token) tagName() string {
	switch t.kind {
	case tokenOpen, tokenSelfClose:
		s := strings.TrimPrefix(t.raw, "<")
		if strings.HasSuffix(s, "/>") {
			s = s[:len(s)-2]
		} else {
			s = strings.TrimSuffix(s, ">")
		}
		// '\r' is included so that a CRLF line break inside the tag does not
		// leave a stray carriage return on the name.
		if idx := strings.IndexAny(s, " \t\r\n"); idx > 0 {
			s = s[:idx]
		}
		return strings.ToLower(s)
	case tokenClose:
		s := strings.TrimPrefix(t.raw, "</")
		return strings.ToLower(strings.TrimSpace(strings.TrimSuffix(s, ">")))
	}
	return ""
}

// xmlTagEnd returns the index of the '>' that ends the tag starting at s[0]
// (which is expected to be '<'), or -1 if s contains no '>' at all.
//
// A '>' inside a quoted attribute value does not end the tag. A quote only
// opens a value when it follows '=' (ignoring whitespace), and a '<' inside a
// value means the markup is broken; in that case, and when a value is never
// closed, the first '>' in s is used so malformed input degrades to a plain
// "first '>' ends the tag" scan.
func xmlTagEnd(s string) int {
	var quote, prev byte // active quote character; last non-whitespace byte
	for i := 1; i < len(s); i++ {
		c := s[i]
		switch {
		case quote != 0:
			if c == quote {
				quote = 0
			} else if c == '<' {
				return strings.IndexByte(s, '>')
			}
		case c == '>':
			return i
		case (c == '"' || c == '\'') && prev == '=':
			quote = c
		}
		if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
			prev = c
		}
	}
	return strings.IndexByte(s, '>')
}

// tokenize splits input into tags, text, CDATA sections and comments.
//
// It is lenient rather than validating: an unterminated CDATA section or
// comment extends to the end of the input, and a '<' that is never closed by
// a '>' turns the rest of the input into a text token. Anything else between
// '<' and its closing '>' is a tag, classified as close ("</" prefix),
// self-closing ("/>" suffix) or open.
func tokenize(input string) []token {
	const (
		cdataOpen   = "<![CDATA["
		commentOpen = "<!--"
	)

	var tokens []token
	for i := 0; i < len(input); {
		rest := input[i:]

		var kind tokenKind
		end := -1 // length of the token within rest; -1 means "all of rest"
		switch {
		case rest[0] != '<':
			kind = tokenText
			end = strings.IndexByte(rest, '<')
		case strings.HasPrefix(rest, cdataOpen):
			kind = tokenCDATA
			// Look for the terminator after the opener only.
			if k := strings.Index(rest[len(cdataOpen):], "]]>"); k >= 0 {
				end = len(cdataOpen) + k + len("]]>")
			}
		case strings.HasPrefix(rest, commentOpen):
			kind = tokenComment
			if k := strings.Index(rest[len(commentOpen):], "-->"); k >= 0 {
				end = len(commentOpen) + k + len("-->")
			}
		default:
			k := xmlTagEnd(rest)
			if k == -1 {
				// No closing '>' anywhere: treat the remainder as text.
				kind = tokenText
				break
			}
			end = k + 1
			switch tag := rest[:end]; {
			case strings.HasPrefix(tag, "</"):
				kind = tokenClose
			case strings.HasSuffix(tag, "/>"):
				kind = tokenSelfClose
			default:
				kind = tokenOpen
			}
		}

		if end == -1 {
			tokens = append(tokens, token{kind: kind, raw: rest})
			break
		}
		tokens = append(tokens, token{kind: kind, raw: rest[:end]})
		i += end
	}
	return tokens
}