~bigbes/confluence-md-utilities (fb67f6989dc2cabe60498e0e7b8f7a51e1e9826b): converter/xml2md.go

package converter

import (
	"bytes"
	"fmt"
	htmlpkg "html"
	"strings"

	"golang.org/x/net/html"
)

// ConfluenceToMarkdown converts Confluence storage format XML to Markdown.
//
// The input is parsed with the HTML parser from x/net/html after CDATA
// sections have been rewritten by preprocessCDATA. An error is returned when
// the parser reports one or when the parsed tree contains no <body> element.
// The returned Markdown is trimmed of surrounding whitespace and always ends
// with exactly one newline.
func ConfluenceToMarkdown(source string) (string, error) {
	// Preprocess: extract CDATA content and replace it with escaped text,
	// because x/net/html doesn't handle CDATA sections.
	preprocessed := preprocessCDATA(source)

	// Wrap in a root element so the HTML parser handles it correctly.
	wrapped := "<div>" + preprocessed + "</div>"
	doc, err := html.Parse(strings.NewReader(wrapped))
	if err != nil {
		return "", fmt.Errorf("parsing confluence xml: %w", err)
	}

	// Resulting structure: html > head + body > div (the wrapper).
	body := findNode(doc, "body")
	if body == nil {
		return "", fmt.Errorf("unexpected parse structure")
	}

	var buf bytes.Buffer
	c := &xmlConverter{buf: &buf}
	// Walk every child of <body> rather than only the wrapper div: the wrapper
	// is a plain pass-through element for walk, and anything the parser placed
	// after it (for example content following an unbalanced </div> in the
	// input) is converted too instead of being dropped.
	c.walkChildren(body, 0)

	result := collapseBlankLinesOutsideFences(buf.String())
	return strings.TrimSpace(result) + "\n", nil
}

// collapseBlankLinesOutsideFences reduces every run of consecutive empty
// lines to a single empty line, except between fence lines (lines starting
// with three backticks), where the text is code and is kept verbatim.
func collapseBlankLinesOutsideFences(s string) string {
	lines := strings.Split(s, "\n")
	kept := lines[:0] // filter in place; kept never outruns the read position
	inFence := false
	prevBlank := false
	for _, line := range lines {
		if strings.HasPrefix(line, "```") {
			inFence = !inFence
		}
		blank := line == ""
		if blank && prevBlank && !inFence {
			continue
		}
		kept = append(kept, line)
		prevBlank = blank
	}
	return strings.Join(kept, "\n")
}

// preprocessCDATA rewrites every <![CDATA[...]]> section in s as
// <cdatacontent>ESCAPED</cdatacontent>, where ESCAPED is the HTML-escaped
// section payload, because x/net/html doesn't parse CDATA. Text outside CDATA
// sections is copied through unchanged. If a section is never terminated, its
// opening marker is dropped and the remaining text is copied as is.
func preprocessCDATA(s string) string {
	const (
		cdataOpen  = "<![CDATA["
		cdataClose = "]]>"
	)

	var out strings.Builder
	remaining := s
	for {
		before, after, opened := strings.Cut(remaining, cdataOpen)
		out.WriteString(before)
		if !opened {
			break
		}

		payload, tail, closed := strings.Cut(after, cdataClose)
		if !closed {
			// Unterminated section: keep the text, lose only the marker.
			out.WriteString(after)
			break
		}

		// Wrap the payload in a dedicated element the converter can detect.
		out.WriteString("<cdatacontent>" + htmlpkg.EscapeString(payload) + "</cdatacontent>")
		remaining = tail
	}
	return out.String()
}

// xmlConverter carries the mutable state of one conversion while the parsed
// node tree is walked.
//
//   - buf receives the generated Markdown; renderPanelAsBlockquote swaps it
//     temporarily to capture a panel body.
//   - listDepth counts the ul, ol and task-list elements currently being
//     walked and determines list-item indentation.
//   - inListItem is true while the children of an li element are walked, which
//     stops paragraphs from emitting their trailing blank line.
type xmlConverter struct {
	buf        *bytes.Buffer
	listDepth  int
	inListItem bool
}

// walkChildren converts each direct child of n, in document order, by
// handing it to walk with the same depth.
func (c *xmlConverter) walkChildren(n *html.Node, depth int) {
	child := n.FirstChild
	for child != nil {
		c.walk(child, depth)
		child = child.NextSibling
	}
}

// walk converts a single node, and where appropriate its subtree, to
// Markdown appended to c.buf.
//
// Text nodes are written with runs of whitespace collapsed to one space;
// whitespace-only text is dropped inside lists and written unchanged
// elsewhere. Other non-element nodes contribute only their children.
// Standard HTML elements are rendered here and every unrecognised element is
// delegated to handleConfluenceElement. depth is passed through unchanged to
// all recursive calls.
func (c *xmlConverter) walk(n *html.Node, depth int) {
	if n.Type == html.TextNode {
		switch blank := strings.TrimSpace(n.Data) == ""; {
		case blank && c.listDepth > 0:
			// Whitespace between list elements carries no content.
		case blank:
			c.buf.WriteString(n.Data)
		default:
			// Collapse XML indentation artifacts inside real text.
			c.buf.WriteString(collapseWhitespace(n.Data))
		}
		return
	}
	if n.Type != html.ElementNode {
		c.walkChildren(n, depth)
		return
	}

	// surround writes marker, the element's children, then marker again.
	surround := func(marker string) {
		c.buf.WriteString(marker)
		c.walkChildren(n, depth)
		c.buf.WriteString(marker)
	}

	switch tag := strings.ToLower(n.Data); tag {
	// Headings: the digit in the tag name is the number of '#' characters.
	case "h1", "h2", "h3", "h4", "h5", "h6":
		hashes := strings.Repeat("#", int(tag[1]-'0'))
		c.buf.WriteString("\n" + hashes + " ")
		c.walkChildren(n, depth)
		c.buf.WriteString("\n\n")

	// Paragraphs end with a blank line unless they sit inside a list item.
	case "p":
		c.walkChildren(n, depth)
		if !c.inListItem {
			c.buf.WriteString("\n\n")
		}

	// Inline formatting
	case "strong", "b":
		surround("**")
	case "em", "i":
		surround("*")
	case "del", "s":
		surround("~~")
	case "code":
		surround("`")

	// Links
	case "a":
		c.buf.WriteString("[")
		c.walkChildren(n, depth)
		c.buf.WriteString("](" + getAttr(n, "href") + ")")

	// Line break
	case "br":
		c.buf.WriteString("  \n")

	// Horizontal rule
	case "hr":
		c.buf.WriteString("\n---\n\n")

	// Lists: only the outermost list is padded with newlines.
	case "ul", "ol":
		c.listDepth++
		if c.listDepth == 1 {
			c.buf.WriteString("\n")
		}
		if tag == "ol" {
			c.walkOL(n, depth)
		} else {
			c.walkChildren(n, depth)
		}
		c.listDepth--
		if c.listDepth == 0 {
			c.buf.WriteString("\n")
		}

	case "li":
		wasInItem := c.inListItem
		c.inListItem = true
		// Task items get their "- [ ]" prefix from the task-status handler;
		// every other item gets an indented bullet here.
		if !hasTaskStatus(n) {
			c.buf.WriteString(strings.Repeat("  ", max(0, c.listDepth-1)) + "- ")
		}
		c.walkChildrenInline(n, depth)
		c.buf.WriteString("\n")
		c.inListItem = wasInItem

	// Tables are converted to GFM tables.
	case "table":
		c.renderTable(n, depth)

	// Layout/structural elements: emit nothing, pass through to children.
	case "div", "span", "tbody", "thead", "colgroup", "col", "content-wrapper":
		c.walkChildren(n, depth)

	// Everything else, including the Confluence ac:*/ri:* elements.
	default:
		c.handleConfluenceElement(n, tag, depth)
	}
}

// handleConfluenceElement renders the elements walk does not handle itself:
// the Confluence ac:* / ri:* storage-format elements and <time>. tag is the
// lower-cased element name of n; element kinds are recognised by substring
// match on it, and anything unrecognised is rendered as just its children.
//
// Storage format writes empty elements as self-closing tags (for example
// <ac:emoticon ac:name="tick"/>), but the HTML parser ignores the
// self-closing flag on non-void elements and leaves them open. Content that
// followed such an element in the source therefore shows up as its children
// in the parsed tree. Emoticons, <time> and the toc macro keep walking their
// children for that reason; otherwise the text after them would be lost.
func (c *xmlConverter) handleConfluenceElement(n *html.Node, tag string, depth int) {
	// attr returns the first non-empty value among the named attributes of n.
	attr := func(names ...string) string {
		for _, name := range names {
			if v := getAttr(n, name); v != "" {
				return v
			}
		}
		return ""
	}

	switch {
	// Confluence structured macros (code blocks, panels, etc.)
	case strings.Contains(tag, "structured-macro"):
		switch attr("ac:name", "name") {
		case "code":
			c.renderCodeMacro(n)
		case "info", "note", "warning":
			c.renderPanelAsBlockquote(n, depth)
		case "toc":
			// The macro and its parameters produce no output, but any other
			// child is content swallowed by a self-closing toc tag.
			for child := n.FirstChild; child != nil; child = child.NextSibling {
				if child.Type == html.ElementNode && strings.Contains(strings.ToLower(child.Data), "parameter") {
					continue
				}
				c.walk(child, depth)
			}
		default:
			c.walkChildren(n, depth)
		}

	// Confluence images
	case strings.Contains(tag, "image"):
		alt := attr("ac:alt", "alt")
		imgRef := c.findImageRef(n)
		if imgRef.isAttachment {
			// Preserve attachment reference as round-trippable HTML.
			// NOTE(review): attribute values are written unescaped; a value
			// containing a double quote yields malformed HTML. Confirm how the
			// reverse converter reads these attributes before escaping them.
			fmt.Fprintf(c.buf, `<span data-attachment="%s"`, imgRef.filename)
			if alt != "" {
				fmt.Fprintf(c.buf, ` data-alt="%s"`, alt)
			}
			c.buf.WriteString("/>")
		} else {
			c.buf.WriteString("![" + alt + "](" + imgRef.url + ")")
		}

	// Confluence links (user mentions, page links): rendered through their
	// children, whatever the link target is.
	case strings.Contains(tag, "ac:link"):
		c.walkChildren(n, depth)

	// Confluence emoticons; unknown names produce no marker.
	case strings.Contains(tag, "emoticon"):
		switch attr("ac:name", "name") {
		case "plus":
			c.buf.WriteString("(+)")
		case "minus":
			c.buf.WriteString("(-)")
		case "question":
			c.buf.WriteString("(?)")
		case "tick":
			c.buf.WriteString("(v)")
		case "cross":
			c.buf.WriteString("(x)")
		}
		// Children are content that followed a self-closing emoticon tag.
		c.walkChildren(n, depth)

	// Confluence task lists
	case strings.Contains(tag, "task-list"):
		c.listDepth++
		c.walkChildren(n, depth)
		c.listDepth--
	case strings.Contains(tag, "task-body"):
		c.walkChildren(n, depth)
		c.buf.WriteString("\n")
	case strings.Contains(tag, "task-status"):
		status := strings.TrimSpace(getTextContent(n))
		indent := strings.Repeat("  ", max(0, c.listDepth-1))
		if status == "complete" {
			c.buf.WriteString(indent + "- [x] ")
		} else {
			c.buf.WriteString(indent + "- [ ] ")
		}
	case strings.Contains(tag, "task-id"):
		// Skip task IDs
	case strings.Contains(tag, "task") && !strings.Contains(tag, "task-"):
		c.walkChildren(n, depth)

	// Confluence inline comment markers: preserve as span with data attribute
	case strings.Contains(tag, "inline-comment-marker"):
		if ref := attr("ac:ref", "ref"); ref != "" {
			fmt.Fprintf(c.buf, `<span data-inline-comment="%s">`, ref)
			c.walkChildren(n, depth)
			c.buf.WriteString("</span>")
		} else {
			c.walkChildren(n, depth)
		}

	// User references: preserve as round-trippable HTML span
	case strings.Contains(tag, "ri:user"):
		if userKey := attr("ri:userkey", "userkey"); userKey != "" {
			fmt.Fprintf(c.buf, `<span data-user-key="%s"/>`, userKey)
		}

	// Time elements: emit the datetime attribute, then whatever followed a
	// self-closing <time/> tag.
	case tag == "time":
		if datetime := getAttr(n, "datetime"); datetime != "" {
			c.buf.WriteString(datetime)
		}
		c.walkChildren(n, depth)

	// Fallback: just walk children
	default:
		c.walkChildren(n, depth)
	}
}

// renderCodeMacro writes a Confluence "code" macro as a fenced Markdown code
// block. The info string is the text of the macro's "language" parameter
// (empty when absent) and the block body is the content of its
// plain-text-body element. The body is always followed by a newline before
// the closing fence.
func (c *xmlConverter) renderCodeMacro(n *html.Node) {
	var language, code string

	// Scan the whole macro subtree; later matches overwrite earlier ones.
	var scan func(*html.Node)
	scan = func(node *html.Node) {
		if node.Type == html.ElementNode {
			elem := strings.ToLower(node.Data)
			if strings.Contains(elem, "parameter") {
				param := getAttr(node, "ac:name")
				if param == "" {
					param = getAttr(node, "name")
				}
				if param == "language" {
					language = getTextContent(node)
				}
			}
			if strings.Contains(elem, "plain-text-body") {
				code = getCDATAContent(node)
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			scan(child)
		}
	}
	scan(n)

	const fence = "```"
	c.buf.WriteString("\n" + fence + language + "\n" + code)
	if !strings.HasSuffix(code, "\n") {
		c.buf.WriteString("\n")
	}
	c.buf.WriteString(fence + "\n\n")
}

// renderPanelAsBlockquote writes a panel macro as a Markdown blockquote. The
// children of every rich-text-body element found under n are converted into
// a scratch buffer; the trimmed result is then emitted line by line with a
// "> " prefix, followed by one empty line.
func (c *xmlConverter) renderPanelAsBlockquote(n *html.Node, depth int) {
	// Redirect output so the body can be post-processed before it is emitted.
	outer := c.buf
	var scratch bytes.Buffer
	c.buf = &scratch

	var descend func(*html.Node)
	descend = func(node *html.Node) {
		if node.Type == html.ElementNode && strings.Contains(strings.ToLower(node.Data), "rich-text-body") {
			c.walkChildren(node, depth)
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			descend(child)
		}
	}
	descend(n)

	c.buf = outer
	for _, line := range strings.Split(strings.TrimSpace(scratch.String()), "\n") {
		c.buf.WriteString("> " + line + "\n")
	}
	c.buf.WriteString("\n")
}

// renderTable writes the table rooted at n as a GFM table. The column count
// is that of the widest row. When the first collected row is a header row it
// becomes the table header; otherwise an empty header row is written, since
// the separator line must follow a header. Tables without any cell produce
// no output.
func (c *xmlConverter) renderTable(n *html.Node, depth int) {
	rows := collectTableRows(n)

	width := 0
	for i := range rows {
		width = max(width, len(rows[i].cells))
	}
	if width == 0 {
		// Covers both "no rows" and "rows without cells".
		return
	}

	c.buf.WriteString("\n")

	body := rows
	if rows[0].isHeader {
		c.writeTableRow(rows[0].cells, width)
		body = rows[1:]
	} else {
		c.writeTableRow(make([]string, width), width)
	}
	c.writeTableSep(width)

	for _, row := range body {
		c.writeTableRow(row.cells, width)
	}
	c.buf.WriteString("\n")
}

// writeTableRow writes one GFM table row with exactly cols columns. Missing
// cells are written as empty columns and cells beyond cols are ignored.
//
// A table row must stay on a single line and '|' delimits its columns, so
// line breaks inside a cell are replaced by a space and every '|' is escaped
// as `\|` before the cell is written.
func (c *xmlConverter) writeTableRow(cells []string, cols int) {
	// "\r\n" is listed first so it becomes one space rather than two.
	sanitize := strings.NewReplacer("\r\n", " ", "\n", " ", "\r", " ", "|", `\|`)

	c.buf.WriteString("|")
	for i := 0; i < cols; i++ {
		cell := ""
		if i < len(cells) {
			cell = sanitize.Replace(cells[i])
		}
		c.buf.WriteString(" " + cell + " |")
	}
	c.buf.WriteString("\n")
}

// writeTableSep writes the GFM header separator line for a table with cols
// columns: a leading "|" followed by one "---|" per column.
func (c *xmlConverter) writeTableSep(cols int) {
	c.buf.WriteString("|")
	for remaining := cols; remaining > 0; remaining-- {
		c.buf.WriteString("---|")
	}
	c.buf.WriteString("\n")
}

// walkOL renders the li children of the ordered list n as numbered items,
// counting from 1 and indenting by two spaces per nesting level beyond the
// first. Children of n that are not li elements are ignored.
func (c *xmlConverter) walkOL(n *html.Node, depth int) {
	number := 0
	for item := n.FirstChild; item != nil; item = item.NextSibling {
		if item.Type != html.ElementNode || strings.ToLower(item.Data) != "li" {
			continue
		}
		number++
		indent := strings.Repeat("  ", max(0, c.listDepth-1))
		fmt.Fprintf(c.buf, "%s%d. ", indent, number)
		c.walkChildrenInline(item, depth)
		c.buf.WriteString("\n")
	}
}

// walkChildrenInline renders the children of n on the current output line,
// as needed for list-item content. Text is whitespace-collapsed, with the
// leading space removed from a first child and the trailing space removed
// from a last child. Paragraph children are unwrapped and rendered inline,
// nested lists start on a new line, and every other element goes through
// walk. Non-text, non-element children are skipped.
func (c *xmlConverter) walkChildrenInline(n *html.Node, depth int) {
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		switch child.Type {
		case html.TextNode:
			text := collapseWhitespace(child.Data)
			if child == n.FirstChild {
				text = strings.TrimLeft(text, " ")
			}
			if child.NextSibling == nil {
				text = strings.TrimRight(text, " ")
			}
			if text != "" {
				c.buf.WriteString(text)
			}

		case html.ElementNode:
			switch strings.ToLower(child.Data) {
			case "p":
				c.walkChildrenInline(child, depth)
			case "ul", "ol":
				c.buf.WriteString("\n")
				c.walk(child, depth)
			default:
				c.walk(child, depth)
			}
		}
	}
}

// tableRow is one table row as collected by collectTableRows. isHeader is
// set when the row sits inside a thead element or contains at least one th
// cell; cells holds the trimmed text content of the row's th and td cells in
// document order.
type tableRow struct {
	isHeader bool
	cells    []string
}

// collectTableRows gathers the rows of table in document order. A row is
// flagged as a header when it is found under a thead element or contains a
// th cell. Cell values are the trimmed text content of each th/td; the
// search does not descend into tr elements, so rows of tables nested inside
// a cell are not collected separately.
func collectTableRows(table *html.Node) []tableRow {
	var rows []tableRow

	var visit func(node *html.Node, inHeader bool)
	visit = func(node *html.Node, inHeader bool) {
		if node.Type == html.ElementNode {
			switch tag := strings.ToLower(node.Data); tag {
			case "thead", "tbody":
				// Section elements decide the header flag for their subtree.
				inHeader = tag == "thead"
			case "tr":
				row := tableRow{isHeader: inHeader}
				for cell := node.FirstChild; cell != nil; cell = cell.NextSibling {
					if cell.Type != html.ElementNode {
						continue
					}
					switch strings.ToLower(cell.Data) {
					case "th":
						row.isHeader = true
						fallthrough
					case "td":
						row.cells = append(row.cells, strings.TrimSpace(getTextContent(cell)))
					}
				}
				rows = append(rows, row)
				return
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child, inHeader)
		}
	}

	visit(table, false)
	return rows
}

// imageRef describes the target of a Confluence image element as located by
// findImageRef. url is the value attribute of an element whose name contains
// "url"; filename and isAttachment are set when an element whose name
// contains "attachment" carries a filename attribute.
type imageRef struct {
	url          string
	filename     string
	isAttachment bool
}

// findImageRef searches the subtree of n for the image target: an element
// whose name contains "url" with a value attribute (as in
// <ri:url ri:value="..."/>), or one whose name contains "attachment" with a
// filename attribute (as in <ri:attachment ri:filename="..."/>). The search
// does not descend below a matching element but continues with its siblings,
// so a later match overwrites an earlier one.
func (c *xmlConverter) findImageRef(n *html.Node) imageRef {
	var found imageRef

	// attrOr reads the namespaced attribute, falling back to the bare name.
	attrOr := func(node *html.Node, qualified, bare string) string {
		if v := getAttr(node, qualified); v != "" {
			return v
		}
		return getAttr(node, bare)
	}

	var search func(*html.Node)
	search = func(node *html.Node) {
		if node.Type == html.ElementNode {
			name := strings.ToLower(node.Data)
			if strings.Contains(name, "url") {
				if v := attrOr(node, "ri:value", "value"); v != "" {
					found.url = v
					return
				}
			}
			if strings.Contains(name, "attachment") {
				if f := attrOr(node, "ri:filename", "filename"); f != "" {
					found.filename = f
					found.isAttachment = true
					return
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			search(child)
		}
	}

	search(n)
	return found
}

// hasUserChild reports whether any direct element child of n has a name
// containing "user" (case-insensitively).
func (c *xmlConverter) hasUserChild(n *html.Node) bool {
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if child.Type != html.ElementNode {
			continue
		}
		if strings.Contains(strings.ToLower(child.Data), "user") {
			return true
		}
	}
	return false
}

// Helper functions

// findNode returns the first element named tag in the subtree rooted at n,
// in document (pre-order) order, or nil when there is none. The comparison
// with the element name is exact.
func findNode(n *html.Node, tag string) *html.Node {
	pending := []*html.Node{n}
	for len(pending) > 0 {
		last := len(pending) - 1
		cur := pending[last]
		pending = pending[:last]

		if cur.Type == html.ElementNode && cur.Data == tag {
			return cur
		}
		// Push children right-to-left so the leftmost is visited next.
		for child := cur.LastChild; child != nil; child = child.PrevSibling {
			pending = append(pending, child)
		}
	}
	return nil
}

// getAttr returns the value of the first attribute of n whose name equals
// key, or "" when there is none. An attribute with a non-empty namespace is
// matched under the name "namespace:key".
func getAttr(n *html.Node, key string) string {
	for i := range n.Attr {
		a := &n.Attr[i]
		name := a.Key
		if a.Namespace != "" {
			name = a.Namespace + ":" + a.Key
		}
		if name == key {
			return a.Val
		}
	}
	return ""
}

// collapseWhitespace replaces every run of spaces, tabs, newlines and
// carriage returns in s with a single space. Leading and trailing runs are
// collapsed as well, not removed, so whitespace at either end survives as
// one space.
func collapseWhitespace(s string) string {
	var out strings.Builder
	prevWasSpace := false
	for _, r := range s {
		switch r {
		case ' ', '\t', '\n', '\r':
			if prevWasSpace {
				continue
			}
			out.WriteByte(' ')
			prevWasSpace = true
		default:
			out.WriteRune(r)
			prevWasSpace = false
		}
	}
	return out.String()
}

// hasTaskStatus reports whether any direct element child of n has a name
// containing "task-status" (case-insensitively).
func hasTaskStatus(n *html.Node) bool {
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		isElement := child.Type == html.ElementNode
		if isElement && strings.Contains(strings.ToLower(child.Data), "task-status") {
			return true
		}
	}
	return false
}

// getCDATAContent returns the text held by the subtree of n, which covers
// both the <cdatacontent> elements produced by preprocessCDATA and any text
// outside them.
//
// preprocessCDATA HTML-escapes each CDATA payload before parsing, and the
// HTML parser decodes entity references while building text nodes, so the
// text node data is already the original payload. It must not be unescaped a
// second time: doing so would turn literal entity text in code, such as
// "&lt;" or "&amp;", into "<" or "&".
func getCDATAContent(n *html.Node) string {
	return getTextContent(n)
}

// getTextContent returns the concatenation of the data of every text node
// in the subtree rooted at n, in document order.
func getTextContent(n *html.Node) string {
	var buf bytes.Buffer
	pending := []*html.Node{n}
	for len(pending) > 0 {
		last := len(pending) - 1
		cur := pending[last]
		pending = pending[:last]

		if cur.Type == html.TextNode {
			buf.WriteString(cur.Data)
		}
		// Push children right-to-left so the leftmost is visited next.
		for child := cur.LastChild; child != nil; child = child.PrevSibling {
			pending = append(pending, child)
		}
	}
	return buf.String()
}