package converter
import (
"bytes"
"fmt"
htmlpkg "html"
"strings"
"golang.org/x/net/html"
)
// ConfluenceToMarkdown renders a Confluence storage-format document as
// Markdown.
//
// CDATA sections are rewritten first (the HTML parser does not understand
// them), the result is wrapped in a <div> and parsed, and the children of the
// wrapper are walked to produce the output. Runs of three or more newlines are
// then squeezed down to two, surrounding whitespace is trimmed, and a single
// trailing newline is appended.
//
// An error is returned when parsing fails or when the parsed tree contains no
// <body> element.
func ConfluenceToMarkdown(source string) (string, error) {
	// x/net/html has no CDATA support, so those sections are turned into
	// ordinary elements up front; the <div> gives the fragment a single root.
	input := "<div>" + preprocessCDATA(source) + "</div>"

	doc, err := html.Parse(strings.NewReader(input))
	if err != nil {
		return "", fmt.Errorf("parsing confluence xml: %w", err)
	}

	// The wrapper <div> is expected to be the first child of <body>.
	body := findNode(doc, "body")
	if body == nil {
		return "", fmt.Errorf("unexpected parse structure")
	}

	var out bytes.Buffer
	if root := body.FirstChild; root != nil {
		conv := &xmlConverter{buf: &out}
		conv.walkChildren(root, 0)
	}

	// Squeeze runs of blank lines down to a single blank line.
	md := out.String()
	for strings.Contains(md, "\n\n\n") {
		md = strings.ReplaceAll(md, "\n\n\n", "\n\n")
	}
	return strings.TrimSpace(md) + "\n", nil
}
// preprocessCDATA rewrites every <![CDATA[...]]> section in s as a
// <cdatacontent> element whose text is the HTML-escaped CDATA payload, because
// x/net/html does not parse CDATA sections. Text outside CDATA sections is
// copied through unchanged.
//
// A CDATA section that is never closed is treated as running to the end of
// the input, so its payload is still escaped instead of being handed to the
// HTML parser as markup.
func preprocessCDATA(s string) string {
	const (
		cdataOpen  = "<![CDATA["
		cdataClose = "]]>"
	)

	var out strings.Builder
	out.Grow(len(s))
	for {
		before, rest, found := strings.Cut(s, cdataOpen)
		out.WriteString(before)
		if !found {
			break
		}
		// When the terminator is missing, Cut yields the whole remainder as
		// the payload and an empty tail, which ends the loop on the next pass.
		payload, tail, _ := strings.Cut(rest, cdataClose)
		out.WriteString("<cdatacontent>")
		out.WriteString(htmlpkg.EscapeString(payload))
		out.WriteString("</cdatacontent>")
		s = tail
	}
	return out.String()
}
// xmlConverter carries the state shared by the tree-walking methods that
// render a parsed document as Markdown.
type xmlConverter struct {
	// buf receives the Markdown output. renderPanelAsBlockquote temporarily
	// points it at a scratch buffer while it captures a panel body.
	buf *bytes.Buffer
	// listDepth is the number of enclosing list containers (ul, ol,
	// task-list) currently being rendered; it drives item indentation and
	// the dropping of whitespace-only text nodes inside lists.
	listDepth int
	// inListItem is true while walk is rendering the contents of an li
	// element; it suppresses the blank line normally written after a <p>.
	inListItem bool
}
// walkChildren renders each direct child of n, in document order, by passing
// it to walk with the same depth.
func (c *xmlConverter) walkChildren(n *html.Node, depth int) {
	node := n.FirstChild
	for node != nil {
		c.walk(node, depth)
		node = node.NextSibling
	}
}
// walk renders node n and everything below it into c.buf as Markdown.
//
// Text nodes are written with runs of whitespace collapsed to one space;
// whitespace-only text is dropped while inside a list and written verbatim
// otherwise. Any other non-element node is transparent: only its children are
// rendered. Standard HTML elements are mapped to their Markdown counterparts
// here, and every element this method does not recognise is passed to
// handleConfluenceElement.
//
// depth is not interpreted by this method; it is forwarded unchanged to every
// callee.
func (c *xmlConverter) walk(n *html.Node, depth int) {
	// Indentation written per list nesting level. Four spaces places a nested
	// item inside its parent's content for both "- " and "1. " markers; a
	// single space would make the nested item a sibling of its parent.
	const listIndent = "    "

	if n.Type == html.TextNode {
		text := n.Data
		if strings.TrimSpace(text) == "" {
			// Indentation between list elements must not leak into the output.
			if c.listDepth > 0 {
				return
			}
		} else {
			// Source line breaks and indentation are layout artifacts.
			text = collapseWhitespace(text)
		}
		c.buf.WriteString(text)
		return
	}
	if n.Type != html.ElementNode {
		c.walkChildren(n, depth)
		return
	}

	// wrap renders the children between a pair of identical inline markers.
	wrap := func(marker string) {
		c.buf.WriteString(marker)
		c.walkChildren(n, depth)
		c.buf.WriteString(marker)
	}

	tag := strings.ToLower(n.Data)
	switch {
	// Headings h1..h6: one '#' per level ("hr" fails the digit test).
	case len(tag) == 2 && tag[0] == 'h' && tag[1] >= '1' && tag[1] <= '6':
		c.buf.WriteString("\n" + strings.Repeat("#", int(tag[1]-'0')) + " ")
		c.walkChildren(n, depth)
		c.buf.WriteString("\n\n")

	// Paragraphs end with a blank line, except inside a list item where the
	// item itself supplies the line break.
	case tag == "p":
		c.walkChildren(n, depth)
		if !c.inListItem {
			c.buf.WriteString("\n\n")
		}

	// Inline formatting.
	case tag == "strong", tag == "b":
		wrap("**")
	case tag == "em", tag == "i":
		wrap("*")
	case tag == "del", tag == "s":
		wrap("~~")
	case tag == "code":
		wrap("`")

	// Links.
	case tag == "a":
		href := getAttr(n, "href")
		c.buf.WriteString("[")
		c.walkChildren(n, depth)
		c.buf.WriteString("](")
		c.buf.WriteString(href)
		c.buf.WriteString(")")

	// Hard line break: Markdown requires at least two trailing spaces.
	case tag == "br":
		c.buf.WriteString("  \n")

	// Horizontal rule.
	case tag == "hr":
		c.buf.WriteString("\n---\n\n")

	// Lists: the outermost list is separated from surrounding content by a
	// newline on each side; ordered lists number their own items in walkOL.
	case tag == "ul", tag == "ol":
		c.listDepth++
		if c.listDepth == 1 {
			c.buf.WriteString("\n")
		}
		if tag == "ol" {
			c.walkOL(n, depth)
		} else {
			c.walkChildren(n, depth)
		}
		c.listDepth--
		if c.listDepth == 0 {
			c.buf.WriteString("\n")
		}

	case tag == "li":
		prev := c.inListItem
		c.inListItem = true
		// An item holding a task-status child gets its checkbox prefix from
		// that child's handler, so no bullet is written here.
		if !hasTaskStatus(n) {
			c.buf.WriteString(strings.Repeat(listIndent, max(0, c.listDepth-1)))
			c.buf.WriteString("- ")
		}
		c.walkChildrenInline(n, depth)
		c.buf.WriteString("\n")
		c.inListItem = prev

	// Tables are rendered as GFM tables.
	case tag == "table":
		c.renderTable(n, depth)

	// Layout and structural elements contribute nothing themselves; only
	// their children are rendered.
	case tag == "div", tag == "span", tag == "tbody", tag == "thead",
		tag == "colgroup", tag == "col", tag == "content-wrapper":
		c.walkChildren(n, depth)

	// Everything else, including the Confluence ac:* and ri:* elements.
	default:
		c.handleConfluenceElement(n, tag, depth)
	}
}
func (c *xmlConverter) handleConfluenceElement(n *html.Node, tag string, depth int) {
switch {
// Confluence structured macros (code blocks, panels, etc.)
case strings.Contains(tag, "structured-macro") || strings.Contains(tag, "ac:structured-macro"):
macroName := getAttr(n, "ac:name")
if macroName == "" {
macroName = getAttr(n, "name")
}
switch macroName {
case "code":
c.renderCodeMacro(n)
case "info":
c.renderPanelAsBlockquote(n, depth)
case "note":
c.renderPanelAsBlockquote(n, depth)
case "warning":
c.renderPanelAsBlockquote(n, depth)
case "toc":
// Skip TOC macros
default:
c.walkChildren(n, depth)
}
// Confluence images
case strings.Contains(tag, "image") || strings.Contains(tag, "ac:image"):
alt := getAttr(n, "ac:alt")
if alt == "" {
alt = getAttr(n, "alt")
}
imgRef := c.findImageRef(n)
if imgRef.isAttachment {
// Preserve attachment reference as round-trippable HTML
fmt.Fprintf(c.buf, `<span data-attachment="%s"`, imgRef.filename)
if alt != "" {
fmt.Fprintf(c.buf, ` data-alt="%s"`, alt)
}
c.buf.WriteString("/>")
} else {
c.buf.WriteString("
c.buf.WriteString(imgRef.url)
c.buf.WriteString(")")
}
// Confluence links (user mentions, page links)
case strings.Contains(tag, "ac:link"):
if c.hasUserChild(n) {
c.walkChildren(n, depth)
} else {
c.walkChildren(n, depth)
}
// Confluence emoticons
case strings.Contains(tag, "emoticon") || strings.Contains(tag, "ac:emoticon"):
name := getAttr(n, "ac:name")
if name == "" {
name = getAttr(n, "name")
}
switch name {
case "plus":
c.buf.WriteString("(+)")
case "minus":
c.buf.WriteString("(-)")
case "question":
c.buf.WriteString("(?)")
case "tick":
c.buf.WriteString("(v)")
case "cross":
c.buf.WriteString("(x)")
}
// Confluence task lists
case strings.Contains(tag, "task-list"):
c.listDepth++
c.walkChildren(n, depth)
c.listDepth--
case strings.Contains(tag, "task-body"):
c.walkChildren(n, depth)
c.buf.WriteString("\n")
case strings.Contains(tag, "task-status"):
status := strings.TrimSpace(getTextContent(n))
indent := strings.Repeat(" ", max(0, c.listDepth-1))
if status == "complete" {
c.buf.WriteString(indent + "- [x] ")
} else {
c.buf.WriteString(indent + "- [ ] ")
}
case strings.Contains(tag, "task-id"):
// Skip task IDs
case strings.Contains(tag, "task") && !strings.Contains(tag, "task-"):
c.walkChildren(n, depth)
// Confluence inline comment markers — preserve as span with data attribute
case strings.Contains(tag, "inline-comment-marker"):
ref := getAttr(n, "ac:ref")
if ref == "" {
ref = getAttr(n, "ref")
}
if ref != "" {
fmt.Fprintf(c.buf, `<span data-inline-comment="%s">`, ref)
c.walkChildren(n, depth)
c.buf.WriteString("</span>")
} else {
c.walkChildren(n, depth)
}
// User references — preserve as round-trippable HTML span
case strings.Contains(tag, "ri:user"):
userKey := getAttr(n, "ri:userkey")
if userKey == "" {
userKey = getAttr(n, "userkey")
}
if userKey != "" {
fmt.Fprintf(c.buf, `<span data-user-key="%s"/>`, userKey)
}
// Time elements
case tag == "time":
datetime := getAttr(n, "datetime")
if datetime != "" {
c.buf.WriteString(datetime)
}
// Fallback: just walk children
default:
c.walkChildren(n, depth)
}
}
// renderCodeMacro writes the code macro rooted at n as a fenced code block.
//
// The subtree is searched for a parameter element named "language", whose
// trimmed text becomes the fence's info string, and for a plain-text-body
// element, whose CDATA content becomes the code. When several such elements
// exist, the last one visited wins.
//
// The fence is at least three backticks and always longer than the longest
// run of backticks inside the code, so backticks in the code cannot close the
// block early. A trailing newline is added to the code when it lacks one.
func (c *xmlConverter) renderCodeMacro(n *html.Node) {
	var language, code string

	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.ElementNode {
			tag := strings.ToLower(node.Data)
			if strings.Contains(tag, "parameter") {
				name := getAttr(node, "ac:name")
				if name == "" {
					name = getAttr(node, "name")
				}
				if name == "language" {
					// Surrounding whitespace would corrupt the info string.
					language = strings.TrimSpace(getTextContent(node))
				}
			}
			if strings.Contains(tag, "plain-text-body") {
				code = getCDATAContent(node)
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(n)

	// Find the longest run of backticks in the code to size the fence.
	longest, run := 0, 0
	for i := 0; i < len(code); i++ {
		if code[i] != '`' {
			run = 0
			continue
		}
		run++
		longest = max(longest, run)
	}
	fence := strings.Repeat("`", max(3, longest+1))

	c.buf.WriteString("\n" + fence)
	c.buf.WriteString(language)
	c.buf.WriteString("\n")
	c.buf.WriteString(code)
	if !strings.HasSuffix(code, "\n") {
		c.buf.WriteString("\n")
	}
	c.buf.WriteString(fence + "\n\n")
}
// renderPanelAsBlockquote renders the panel macro rooted at n as a Markdown
// blockquote.
//
// Every rich-text-body element found below n is rendered into a scratch
// buffer; the captured text is trimmed, split on newlines, and each line is
// written to the real output prefixed with "> ". A blank line follows the
// quote. depth is forwarded unchanged to walkChildren.
func (c *xmlConverter) renderPanelAsBlockquote(n *html.Node, depth int) {
	// Redirect output into a scratch buffer while the body is rendered.
	var captured bytes.Buffer
	target := c.buf
	c.buf = &captured

	var search func(*html.Node)
	search = func(node *html.Node) {
		isBody := node.Type == html.ElementNode &&
			strings.Contains(strings.ToLower(node.Data), "rich-text-body")
		if isBody {
			// The body's content is rendered; the search does not descend
			// into it any further.
			c.walkChildren(node, depth)
			return
		}
		for kid := node.FirstChild; kid != nil; kid = kid.NextSibling {
			search(kid)
		}
	}
	search(n)
	c.buf = target

	body := strings.TrimSpace(captured.String())
	for _, line := range strings.Split(body, "\n") {
		target.WriteString("> ")
		target.WriteString(line)
		target.WriteString("\n")
	}
	target.WriteString("\n")
}
// renderTable writes the table rooted at n as a GFM table surrounded by
// newlines.
//
// The column count is the widest row's cell count; nothing is written when
// the table has no cells at all. When the first row is a header row it
// becomes the table header, otherwise an empty header row is written so the
// separator line has something to follow. depth is not used.
func (c *xmlConverter) renderTable(n *html.Node, depth int) {
	rows := collectTableRows(n)

	width := 0
	for i := range rows {
		width = max(width, len(rows[i].cells))
	}
	// A zero width also covers the case of a table without any rows.
	if width == 0 {
		return
	}

	c.buf.WriteString("\n")

	body := rows
	if rows[0].isHeader {
		c.writeTableRow(rows[0].cells, width)
		body = rows[1:]
	} else {
		c.writeTableRow(make([]string, width), width)
	}
	c.writeTableSep(width)

	for _, row := range body {
		c.writeTableRow(row.cells, width)
	}
	c.buf.WriteString("\n")
}
// tableCellReplacer makes cell text safe for a single GFM table row: line
// breaks become spaces (a row cannot span lines) and pipes are
// backslash-escaped (an unescaped pipe would start a new cell).
var tableCellReplacer = strings.NewReplacer(
	"\r\n", " ",
	"\n", " ",
	"\r", " ",
	"|", "\\|",
)

// writeTableRow writes one GFM table row of exactly cols cells to c.buf.
//
// Missing trailing cells are written empty and cells beyond cols are ignored.
// Cell text is passed through tableCellReplacer so that embedded pipes or
// line breaks cannot change the shape of the table.
func (c *xmlConverter) writeTableRow(cells []string, cols int) {
	c.buf.WriteString("|")
	for i := range cols {
		c.buf.WriteString(" ")
		if i < len(cells) {
			c.buf.WriteString(tableCellReplacer.Replace(cells[i]))
		}
		c.buf.WriteString(" |")
	}
	c.buf.WriteString("\n")
}
// writeTableSep writes the GFM header separator line for a table with cols
// columns: a leading pipe followed by one "---|" per column and a newline.
func (c *xmlConverter) writeTableSep(cols int) {
	// max guards Repeat against a negative count, for which nothing is repeated.
	c.buf.WriteString("|" + strings.Repeat("---|", max(cols, 0)) + "\n")
}
// walkOL renders the li children of the ordered list n as numbered Markdown
// items, counting from 1. Children that are not li elements are skipped.
//
// Each item is indented according to c.listDepth and rendered inline. While
// an item is being rendered c.inListItem is set, as it is for unordered
// items, so that paragraphs nested inside the item do not emit blank lines.
// depth is forwarded unchanged to walkChildrenInline.
func (c *xmlConverter) walkOL(n *html.Node, depth int) {
	// Indentation written per list nesting level. Four spaces places a nested
	// item inside its parent's content for both "- " and "1. " markers; a
	// single space would make the nested item a sibling of its parent.
	const listIndent = "    "

	// Nested lists restore listDepth before returning, so the indent is the
	// same for every item of this list.
	indent := strings.Repeat(listIndent, max(0, c.listDepth-1))

	number := 1
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if child.Type != html.ElementNode || strings.ToLower(child.Data) != "li" {
			continue
		}
		prev := c.inListItem
		c.inListItem = true

		c.buf.WriteString(indent)
		fmt.Fprintf(c.buf, "%d. ", number)
		c.walkChildrenInline(child, depth)
		c.buf.WriteString("\n")

		c.inListItem = prev
		number++
	}
}
// walkChildrenInline renders the children of n on a single line, as needed
// for the contents of a list item.
//
// Text children have their whitespace collapsed; leading space is removed
// from the first child and trailing space from the last, so single spaces
// between inline elements survive. A <p> child is flattened by rendering its
// own children inline, a nested <ul> or <ol> is started on a new line, and
// every other element is rendered by walk. Nodes that are neither text nor
// elements are ignored.
func (c *xmlConverter) walkChildrenInline(n *html.Node, depth int) {
	for node := n.FirstChild; node != nil; node = node.NextSibling {
		switch node.Type {
		case html.TextNode:
			text := collapseWhitespace(node.Data)
			if node == n.FirstChild {
				text = strings.TrimLeft(text, " ")
			}
			if node.NextSibling == nil {
				text = strings.TrimRight(text, " ")
			}
			// Writing an empty string is a no-op.
			c.buf.WriteString(text)

		case html.ElementNode:
			switch strings.ToLower(node.Data) {
			case "p":
				c.walkChildrenInline(node, depth)
			case "ul", "ol":
				c.buf.WriteString("\n")
				c.walk(node, depth)
			default:
				c.walk(node, depth)
			}
		}
	}
}
// tableRow is one table row as gathered by collectTableRows.
type tableRow struct {
	// isHeader is true when the row sits inside a thead element or contains
	// at least one th cell.
	isHeader bool
	// cells holds the whitespace-trimmed text content of each th/td cell, in
	// document order.
	cells []string
}
// collectTableRows gathers every tr element below table, in document order.
//
// A row is marked as a header when it lies inside a thead element or when it
// contains a th cell; entering a tbody clears the inherited header flag. Each
// th/td child contributes its trimmed text content as one cell. The search
// does not descend into a tr, so rows of tables nested inside a cell are not
// collected separately.
func collectTableRows(table *html.Node) []tableRow {
	var rows []tableRow

	var visit func(node *html.Node, inHeader bool)
	visit = func(node *html.Node, inHeader bool) {
		if node.Type == html.ElementNode {
			switch strings.ToLower(node.Data) {
			case "thead":
				inHeader = true
			case "tbody":
				inHeader = false
			case "tr":
				row := tableRow{isHeader: inHeader}
				for cell := node.FirstChild; cell != nil; cell = cell.NextSibling {
					if cell.Type != html.ElementNode {
						continue
					}
					kind := strings.ToLower(cell.Data)
					if kind != "th" && kind != "td" {
						continue
					}
					if kind == "th" {
						row.isHeader = true
					}
					row.cells = append(row.cells, strings.TrimSpace(getTextContent(cell)))
				}
				rows = append(rows, row)
				return
			}
		}
		for kid := node.FirstChild; kid != nil; kid = kid.NextSibling {
			visit(kid, inHeader)
		}
	}

	visit(table, false)
	return rows
}
// imageRef describes what an image element points at, as found by
// findImageRef.
type imageRef struct {
	// url is the value attribute of a url element found below the image.
	url string
	// filename is the filename attribute of an attachment element found
	// below the image.
	filename string
	// isAttachment is true once a non-empty attachment filename was found.
	isAttachment bool
}
// findImageRef searches n and its descendants for the target of an image.
//
// An element whose name contains "url" supplies the URL through its ri:value
// (or value) attribute; one whose name contains "attachment" supplies the
// file name through ri:filename (or filename) and marks the reference as an
// attachment. The search does not descend below an element that supplied a
// value but continues with the rest of the tree, so a later match overwrites
// an earlier one.
func (c *xmlConverter) findImageRef(n *html.Node) imageRef {
	var found imageRef

	// lookup reads the namespaced attribute and falls back to the bare name.
	lookup := func(node *html.Node, qualified, bare string) string {
		if v := getAttr(node, qualified); v != "" {
			return v
		}
		return getAttr(node, bare)
	}

	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.ElementNode {
			name := strings.ToLower(node.Data)
			if strings.Contains(name, "url") {
				if v := lookup(node, "ri:value", "value"); v != "" {
					found.url = v
					return
				}
			}
			if strings.Contains(name, "attachment") {
				if f := lookup(node, "ri:filename", "filename"); f != "" {
					found.filename = f
					found.isAttachment = true
					return
				}
			}
		}
		for kid := node.FirstChild; kid != nil; kid = kid.NextSibling {
			visit(kid)
		}
	}

	visit(n)
	return found
}
// hasUserChild reports whether any direct element child of n has a name
// containing "user" (compared case-insensitively).
func (c *xmlConverter) hasUserChild(n *html.Node) bool {
	for kid := n.FirstChild; kid != nil; kid = kid.NextSibling {
		if kid.Type != html.ElementNode {
			continue
		}
		if strings.Contains(strings.ToLower(kid.Data), "user") {
			return true
		}
	}
	return false
}
// Helper functions
// findNode returns the first element named tag in the subtree rooted at n,
// searching in document (pre-order) order and including n itself. It returns
// nil when no such element exists. The name comparison is exact.
func findNode(n *html.Node, tag string) *html.Node {
	pending := []*html.Node{n}
	for len(pending) > 0 {
		last := len(pending) - 1
		cur := pending[last]
		pending = pending[:last]

		if cur.Type == html.ElementNode && cur.Data == tag {
			return cur
		}
		// Children are pushed last-to-first so the first child is popped next.
		for kid := cur.LastChild; kid != nil; kid = kid.PrevSibling {
			pending = append(pending, kid)
		}
	}
	return nil
}
// getAttr returns the value of the first attribute of n whose name equals
// key, or "" when there is none. An attribute with a namespace is matched
// under the name "namespace:key".
func getAttr(n *html.Node, key string) string {
	for i := range n.Attr {
		a := &n.Attr[i]
		name := a.Key
		if a.Namespace != "" {
			name = a.Namespace + ":" + a.Key
		}
		if name == key {
			return a.Val
		}
	}
	return ""
}
// collapseWhitespace returns s with every run of spaces, tabs, line feeds and
// carriage returns replaced by a single space. Leading and trailing runs are
// collapsed as well rather than removed, so text that started or ended with
// whitespace keeps exactly one space there.
func collapseWhitespace(s string) string {
	var out strings.Builder
	prevSpace := false
	for _, r := range s {
		switch r {
		case ' ', '\t', '\n', '\r':
			// Only the first character of a run produces output.
			if !prevSpace {
				out.WriteByte(' ')
			}
			prevSpace = true
		default:
			out.WriteRune(r)
			prevSpace = false
		}
	}
	return out.String()
}
// hasTaskStatus reports whether any direct element child of n has a name
// containing "task-status" (compared case-insensitively). Deeper descendants
// are not inspected.
func hasTaskStatus(n *html.Node) bool {
	for kid := n.FirstChild; kid != nil; kid = kid.NextSibling {
		if kid.Type != html.ElementNode {
			continue
		}
		if strings.Contains(strings.ToLower(kid.Data), "task-status") {
			return true
		}
	}
	return false
}
// getCDATAContent returns the text below n, in document order, including the
// payload of any <cdatacontent> element produced by preprocessCDATA.
//
// The HTML parser has already decoded the entities that preprocessCDATA
// introduced, so the text of a <cdatacontent> element is the original CDATA
// payload and is used as is. Unescaping it a second time would corrupt
// payloads that contain literal entity text, turning "&lt;" into "<".
func getCDATAContent(n *html.Node) string {
	var out strings.Builder

	var visit func(*html.Node)
	visit = func(node *html.Node) {
		switch {
		case node.Type == html.ElementNode && node.Data == "cdatacontent":
			out.WriteString(getTextContent(node))
			return
		case node.Type == html.TextNode:
			out.WriteString(node.Data)
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}

	visit(n)
	return out.String()
}
// getTextContent returns the concatenation, in document order, of the data of
// every text node in the subtree rooted at n (n itself included).
func getTextContent(n *html.Node) string {
	var out strings.Builder

	pending := []*html.Node{n}
	for len(pending) > 0 {
		last := len(pending) - 1
		cur := pending[last]
		pending = pending[:last]

		if cur.Type == html.TextNode {
			out.WriteString(cur.Data)
		}
		// Children are pushed last-to-first so the first child is popped next.
		for kid := cur.LastChild; kid != nil; kid = kid.PrevSibling {
			pending = append(pending, kid)
		}
	}
	return out.String()
}