package converter import ( "bytes" "fmt" htmlpkg "html" "strings" "golang.org/x/net/html" ) // ConfluenceToMarkdown converts Confluence storage format XML to Markdown. func ConfluenceToMarkdown(source string) (string, error) { // Preprocess: extract CDATA content and replace with escaped text, // because x/net/html doesn't handle CDATA sections. preprocessed := preprocessCDATA(source) // Wrap in a root element so the HTML parser handles it correctly. wrapped := "
" + preprocessed + "
" doc, err := html.Parse(strings.NewReader(wrapped)) if err != nil { return "", fmt.Errorf("parsing confluence xml: %w", err) } var buf bytes.Buffer c := &xmlConverter{buf: &buf} // Navigate to the wrapper div: html > head > body > div body := findNode(doc, "body") if body == nil { return "", fmt.Errorf("unexpected parse structure") } wrapper := body.FirstChild if wrapper != nil { c.walkChildren(wrapper, 0) } result := buf.String() // Clean up excessive blank lines for strings.Contains(result, "\n\n\n") { result = strings.ReplaceAll(result, "\n\n\n", "\n\n") } return strings.TrimSpace(result) + "\n", nil } // preprocessCDATA replaces with the content as a data attribute // on the parent element, since x/net/html doesn't parse CDATA. func preprocessCDATA(s string) string { var result strings.Builder for { idx := strings.Index(s, "") if endIdx == -1 { result.WriteString(s) break } // Write CDATA content as a special element that we can detect content := s[:endIdx] result.WriteString("") result.WriteString(htmlpkg.EscapeString(content)) result.WriteString("") s = s[endIdx+len("]]>"):] } return result.String() } type xmlConverter struct { buf *bytes.Buffer listDepth int inListItem bool } func (c *xmlConverter) walkChildren(n *html.Node, depth int) { for child := n.FirstChild; child != nil; child = child.NextSibling { c.walk(child, depth) } } func (c *xmlConverter) walk(n *html.Node, depth int) { if n.Type == html.TextNode { text := n.Data // Skip whitespace-only text nodes inside lists if c.listDepth > 0 && strings.TrimSpace(text) == "" { return } // Collapse whitespace in text nodes (XML indentation artifacts) if strings.TrimSpace(text) != "" { // Replace sequences of whitespace (including newlines) with single space, // but preserve the trimmed content text = collapseWhitespace(text) } c.buf.WriteString(text) return } if n.Type != html.ElementNode { c.walkChildren(n, depth) return } tag := strings.ToLower(n.Data) switch { // Headings case tag == "h1": c.buf.WriteString("\n# ") c.walkChildren(n, depth) c.buf.WriteString("\n\n") case tag == "h2": c.buf.WriteString("\n## ") c.walkChildren(n, depth) c.buf.WriteString("\n\n") case tag == "h3": c.buf.WriteString("\n### ") c.walkChildren(n, depth) c.buf.WriteString("\n\n") case tag == "h4": c.buf.WriteString("\n#### ") c.walkChildren(n, depth) c.buf.WriteString("\n\n") case tag == "h5": c.buf.WriteString("\n##### ") c.walkChildren(n, depth) c.buf.WriteString("\n\n") case tag == "h6": c.buf.WriteString("\n###### ") c.walkChildren(n, depth) c.buf.WriteString("\n\n") // Paragraphs case tag == "p": c.walkChildren(n, depth) if !c.inListItem { c.buf.WriteString("\n\n") } // Inline formatting case tag == "strong", tag == "b": c.buf.WriteString("**") c.walkChildren(n, depth) c.buf.WriteString("**") case tag == "em", tag == "i": c.buf.WriteString("*") c.walkChildren(n, depth) c.buf.WriteString("*") case tag == "del", tag == "s": c.buf.WriteString("~~") c.walkChildren(n, depth) c.buf.WriteString("~~") case tag == "code": c.buf.WriteString("`") c.walkChildren(n, depth) c.buf.WriteString("`") // Links case tag == "a": href := getAttr(n, "href") c.buf.WriteString("[") c.walkChildren(n, depth) c.buf.WriteString("](") c.buf.WriteString(href) c.buf.WriteString(")") // Line break case tag == "br": c.buf.WriteString(" \n") // Horizontal rule case tag == "hr": c.buf.WriteString("\n---\n\n") // Lists case tag == "ul": c.listDepth++ if c.listDepth == 1 { c.buf.WriteString("\n") } c.walkChildren(n, depth) c.listDepth-- if c.listDepth == 0 { c.buf.WriteString("\n") } case tag == "ol": c.listDepth++ if c.listDepth == 1 { c.buf.WriteString("\n") } c.walkOL(n, depth) c.listDepth-- if c.listDepth == 0 { c.buf.WriteString("\n") } case tag == "li": prev := c.inListItem c.inListItem = true // Check if this list item contains a task checkbox if hasTaskStatus(n) { // Task status handler will write the prefix, walkChildrenInline for text c.walkChildrenInline(n, depth) c.buf.WriteString("\n") } else { indent := strings.Repeat(" ", max(0, c.listDepth-1)) c.buf.WriteString(indent) c.buf.WriteString("- ") c.walkChildrenInline(n, depth) c.buf.WriteString("\n") } c.inListItem = prev // Tables - convert to GFM table case tag == "table": c.renderTable(n, depth) // Confluence macros - handled via ac:* namespace (parsed as ac-*) // The HTML parser lowercases and handles colons differently. // We need to handle both ac:structured-macro and the parsed form. // Skip layout/structural elements, pass through children case tag == "div", tag == "span", tag == "tbody", tag == "thead", tag == "colgroup", tag == "col", tag == "content-wrapper": c.walkChildren(n, depth) // Handle Confluence-specific elements default: c.handleConfluenceElement(n, tag, depth) } } func (c *xmlConverter) handleConfluenceElement(n *html.Node, tag string, depth int) { switch { // Confluence structured macros (code blocks, panels, etc.) case strings.Contains(tag, "structured-macro") || strings.Contains(tag, "ac:structured-macro"): macroName := getAttr(n, "ac:name") if macroName == "" { macroName = getAttr(n, "name") } switch macroName { case "code": c.renderCodeMacro(n) case "info": c.renderPanelAsBlockquote(n, depth) case "note": c.renderPanelAsBlockquote(n, depth) case "warning": c.renderPanelAsBlockquote(n, depth) case "toc": // Skip TOC macros default: c.walkChildren(n, depth) } // Confluence images case strings.Contains(tag, "image") || strings.Contains(tag, "ac:image"): alt := getAttr(n, "ac:alt") if alt == "" { alt = getAttr(n, "alt") } imgRef := c.findImageRef(n) if imgRef.isAttachment { // Preserve attachment reference as round-trippable HTML fmt.Fprintf(c.buf, `") } else { c.buf.WriteString("![") c.buf.WriteString(alt) c.buf.WriteString("](") c.buf.WriteString(imgRef.url) c.buf.WriteString(")") } // Confluence links (user mentions, page links) case strings.Contains(tag, "ac:link"): if c.hasUserChild(n) { c.walkChildren(n, depth) } else { c.walkChildren(n, depth) } // Confluence emoticons case strings.Contains(tag, "emoticon") || strings.Contains(tag, "ac:emoticon"): name := getAttr(n, "ac:name") if name == "" { name = getAttr(n, "name") } switch name { case "plus": c.buf.WriteString("(+)") case "minus": c.buf.WriteString("(-)") case "question": c.buf.WriteString("(?)") case "tick": c.buf.WriteString("(v)") case "cross": c.buf.WriteString("(x)") } // Confluence task lists case strings.Contains(tag, "task-list"): c.listDepth++ c.walkChildren(n, depth) c.listDepth-- case strings.Contains(tag, "task-body"): c.walkChildren(n, depth) c.buf.WriteString("\n") case strings.Contains(tag, "task-status"): status := strings.TrimSpace(getTextContent(n)) indent := strings.Repeat(" ", max(0, c.listDepth-1)) if status == "complete" { c.buf.WriteString(indent + "- [x] ") } else { c.buf.WriteString(indent + "- [ ] ") } case strings.Contains(tag, "task-id"): // Skip task IDs case strings.Contains(tag, "task") && !strings.Contains(tag, "task-"): c.walkChildren(n, depth) // Confluence inline comment markers — preserve as span with data attribute case strings.Contains(tag, "inline-comment-marker"): ref := getAttr(n, "ac:ref") if ref == "" { ref = getAttr(n, "ref") } if ref != "" { fmt.Fprintf(c.buf, ``, ref) c.walkChildren(n, depth) c.buf.WriteString("") } else { c.walkChildren(n, depth) } // User references — preserve as round-trippable HTML span case strings.Contains(tag, "ri:user"): userKey := getAttr(n, "ri:userkey") if userKey == "" { userKey = getAttr(n, "userkey") } if userKey != "" { fmt.Fprintf(c.buf, ``, userKey) } // Time elements case tag == "time": datetime := getAttr(n, "datetime") if datetime != "" { c.buf.WriteString(datetime) } // Fallback: just walk children default: c.walkChildren(n, depth) } } func (c *xmlConverter) renderCodeMacro(n *html.Node) { language := "" code := "" // Walk children to find parameters and body var walkMacro func(*html.Node) walkMacro = func(node *html.Node) { if node.Type == html.ElementNode { tag := strings.ToLower(node.Data) if strings.Contains(tag, "parameter") || strings.Contains(tag, "ac:parameter") { name := getAttr(node, "ac:name") if name == "" { name = getAttr(node, "name") } if name == "language" { language = getTextContent(node) } } if strings.Contains(tag, "plain-text-body") || strings.Contains(tag, "ac:plain-text-body") { code = getCDATAContent(node) } } for child := node.FirstChild; child != nil; child = child.NextSibling { walkMacro(child) } } walkMacro(n) c.buf.WriteString("\n```") c.buf.WriteString(language) c.buf.WriteString("\n") c.buf.WriteString(code) if !strings.HasSuffix(code, "\n") { c.buf.WriteString("\n") } c.buf.WriteString("```\n\n") } func (c *xmlConverter) renderPanelAsBlockquote(n *html.Node, depth int) { // Collect panel body content var bodyBuf bytes.Buffer origBuf := c.buf c.buf = &bodyBuf // Find rich-text-body and walk it var findBody func(*html.Node) findBody = func(node *html.Node) { if node.Type == html.ElementNode { tag := strings.ToLower(node.Data) if strings.Contains(tag, "rich-text-body") { c.walkChildren(node, depth) return } } for child := node.FirstChild; child != nil; child = child.NextSibling { findBody(child) } } findBody(n) c.buf = origBuf text := strings.TrimSpace(bodyBuf.String()) lines := strings.Split(text, "\n") for _, line := range lines { c.buf.WriteString("> ") c.buf.WriteString(line) c.buf.WriteString("\n") } c.buf.WriteString("\n") } func (c *xmlConverter) renderTable(n *html.Node, depth int) { rows := collectTableRows(n) if len(rows) == 0 { return } // Determine column count cols := 0 for _, row := range rows { if len(row.cells) > cols { cols = len(row.cells) } } if cols == 0 { return } c.buf.WriteString("\n") // If first row is a header isFirstRowHeader := len(rows) > 0 && rows[0].isHeader startIdx := 0 if isFirstRowHeader { c.writeTableRow(rows[0].cells, cols) c.writeTableSep(cols) startIdx = 1 } else { // Write empty header and separator empty := make([]string, cols) c.writeTableRow(empty, cols) c.writeTableSep(cols) } for i := startIdx; i < len(rows); i++ { c.writeTableRow(rows[i].cells, cols) } c.buf.WriteString("\n") } func (c *xmlConverter) writeTableRow(cells []string, cols int) { c.buf.WriteString("|") for i := range cols { cell := "" if i < len(cells) { cell = cells[i] } c.buf.WriteString(" ") c.buf.WriteString(cell) c.buf.WriteString(" |") } c.buf.WriteString("\n") } func (c *xmlConverter) writeTableSep(cols int) { c.buf.WriteString("|") for range cols { c.buf.WriteString("---|") } c.buf.WriteString("\n") } func (c *xmlConverter) walkOL(n *html.Node, depth int) { idx := 1 for child := n.FirstChild; child != nil; child = child.NextSibling { if child.Type != html.ElementNode { continue } tag := strings.ToLower(child.Data) if tag == "li" { indent := strings.Repeat(" ", max(0, c.listDepth-1)) c.buf.WriteString(indent) fmt.Fprintf(c.buf, "%d. ", idx) c.walkChildrenInline(child, depth) c.buf.WriteString("\n") idx++ } } } func (c *xmlConverter) walkChildrenInline(n *html.Node, depth int) { for child := n.FirstChild; child != nil; child = child.NextSibling { if child.Type == html.TextNode { // Collapse whitespace but preserve a single space between inline elements text := collapseWhitespace(child.Data) // Only trim leading space if this is the very first child if child == n.FirstChild { text = strings.TrimLeft(text, " ") } // Only trim trailing space if this is the very last child if child.NextSibling == nil { text = strings.TrimRight(text, " ") } if text != "" { c.buf.WriteString(text) } continue } if child.Type == html.ElementNode { tag := strings.ToLower(child.Data) switch { case tag == "p": c.walkChildrenInline(child, depth) case tag == "ul", tag == "ol": c.buf.WriteString("\n") c.walk(child, depth) default: c.walk(child, depth) } } } } type tableRow struct { isHeader bool cells []string } func collectTableRows(table *html.Node) []tableRow { var rows []tableRow var walk func(*html.Node, bool) walk = func(n *html.Node, inHeader bool) { if n.Type == html.ElementNode { tag := strings.ToLower(n.Data) switch tag { case "thead": for child := n.FirstChild; child != nil; child = child.NextSibling { walk(child, true) } return case "tbody": for child := n.FirstChild; child != nil; child = child.NextSibling { walk(child, false) } return case "tr": row := tableRow{isHeader: inHeader} for child := n.FirstChild; child != nil; child = child.NextSibling { if child.Type == html.ElementNode { cellTag := strings.ToLower(child.Data) if cellTag == "th" { row.isHeader = true row.cells = append(row.cells, strings.TrimSpace(getTextContent(child))) } else if cellTag == "td" { row.cells = append(row.cells, strings.TrimSpace(getTextContent(child))) } } } rows = append(rows, row) return } } for child := n.FirstChild; child != nil; child = child.NextSibling { walk(child, inHeader) } } walk(table, false) return rows } type imageRef struct { url string filename string isAttachment bool } func (c *xmlConverter) findImageRef(n *html.Node) imageRef { var ref imageRef var walk func(*html.Node) walk = func(node *html.Node) { if node.Type == html.ElementNode { tag := strings.ToLower(node.Data) // if strings.Contains(tag, "url") { v := getAttr(node, "ri:value") if v == "" { v = getAttr(node, "value") } if v != "" { ref.url = v return } } // if strings.Contains(tag, "attachment") { f := getAttr(node, "ri:filename") if f == "" { f = getAttr(node, "filename") } if f != "" { ref.filename = f ref.isAttachment = true return } } } for child := node.FirstChild; child != nil; child = child.NextSibling { walk(child) } } walk(n) return ref } func (c *xmlConverter) hasUserChild(n *html.Node) bool { for child := n.FirstChild; child != nil; child = child.NextSibling { if child.Type == html.ElementNode { tag := strings.ToLower(child.Data) if strings.Contains(tag, "user") { return true } } } return false } // Helper functions func findNode(n *html.Node, tag string) *html.Node { if n.Type == html.ElementNode && n.Data == tag { return n } for child := n.FirstChild; child != nil; child = child.NextSibling { if found := findNode(child, tag); found != nil { return found } } return nil } func getAttr(n *html.Node, key string) string { for _, attr := range n.Attr { attrKey := attr.Key if attr.Namespace != "" { attrKey = attr.Namespace + ":" + attr.Key } if attrKey == key { return attr.Val } } return "" } // collapseWhitespace replaces runs of whitespace with a single space, // preserving leading/trailing single space if original had whitespace there. func collapseWhitespace(s string) string { var buf strings.Builder inWS := false for _, r := range s { if r == ' ' || r == '\t' || r == '\n' || r == '\r' { if !inWS { buf.WriteByte(' ') inWS = true } } else { buf.WriteRune(r) inWS = false } } return buf.String() } // hasTaskStatus checks if a node contains a task-status element. func hasTaskStatus(n *html.Node) bool { for child := n.FirstChild; child != nil; child = child.NextSibling { if child.Type == html.ElementNode { tag := strings.ToLower(child.Data) if strings.Contains(tag, "task-status") { return true } } } return false } // getCDATAContent retrieves content from preprocessed CDATA sections. // It looks for elements and unescapes their text. func getCDATAContent(n *html.Node) string { var buf bytes.Buffer var walk func(*html.Node) walk = func(node *html.Node) { if node.Type == html.ElementNode && node.Data == "cdatacontent" { text := getTextContent(node) buf.WriteString(htmlpkg.UnescapeString(text)) return } if node.Type == html.TextNode { buf.WriteString(node.Data) } for child := node.FirstChild; child != nil; child = child.NextSibling { walk(child) } } walk(n) return buf.String() } func getTextContent(n *html.Node) string { var buf bytes.Buffer var walk func(*html.Node) walk = func(node *html.Node) { if node.Type == html.TextNode { buf.WriteString(node.Data) } for child := node.FirstChild; child != nil; child = child.NextSibling { walk(child) } } walk(n) return buf.String() }