harshankur · harshankur · Apr 21, 2026 · Apr 18, 2026
diff --git a/README.md b/README.md
@@ -59,6 +59,7 @@ npx officeparser /path/to/officeFile.docx --ignoreNotes=true --newlineDelimiter=
 - `--extractAttachments=[true|false]`   Flag to extract images/charts as Base64. Default is false.
 - `--ocr=[true|false]`                  Flag to enable OCR for extracted images. Default is false.
 - `--includeRawContent=[true|false]`    Flag to include raw XML/RTF content in nodes. Default is false.
+- `--includeBreakNodes=[true|false]`    Flag to include break nodes (currently only available for DOCX documents). Default is false.
 - `--verbose=[true|false]`              Show full error stack traces.
 
 
@@ -277,13 +278,28 @@ Formatting can be found at two levels:
 1.  **Node Level**: Applied directly to a text run or paragraph.
 2.  **Document Level**: Found in `ast.metadata.formatting` (defaults) or `ast.metadata.styleMap` (named styles).
 
-### 6. Advanced Metadata
+### 6. Breaks
+Breaks are currently only supported when parsing DOCX documents, and only when the `includeBreakNodes` option is enabled. Each break is added as a node of type `break` and carries metadata of type `BreakMetadata`.
+
+```text
+Break Node
+├── type: "break"
+└── metadata: {
+        breakType: "textWrapping" | "page" | "column" | "lastRenderedPage",
+        clear?: "all" | "left" | "none" | "right"
+    }
+```
+
+- `breakType`: Type of break. "textWrapping" (the default when the break element has no explicit type) is a simple line break, "page" is a page break, "column" is a break to the next column, and "lastRenderedPage" is a soft page break recorded by the editing application when the document was last saved (`w:lastRenderedPageBreak`).
+- `clear`: This field is only relevant when `breakType` is "textWrapping". It indicates how the break to the next line should be handled when there are, e.g., floating objects in the document.
+
+### 7. Advanced Metadata
 The `ast.metadata` object provides document-wide context:
 - **`styleMap`**: A dictionary of style names to their `TextFormatting` definitions found in the document.
 - **`formatting`**: Document-wide default settings (e.g., default font or font size).
 - **`customProperties`**: A dictionary of user-defined metadata embedded in the document (OOXML `custom.xml`, ODF `meta:user-defined`, or PDF Info dictionary).
 
-### 7. Custom Properties
+### 8. Custom Properties
 You can access custom user-defined metadata that might be embedded in the document:
 
 ```javascript
@@ -403,6 +419,7 @@ Pass an optional config object as the second argument to `parseOffice`.
 | `ocrConfig.workerPath` | string | `undefined` | Path to Tesseract worker script (for offline use). |
 | `ocrConfig.corePath` | string | `undefined` | Path to Tesseract core script (for offline use). |
 | `ocrConfig.langPath` | string | `undefined` | Path for Tesseract language files (for offline use). |
+| `includeBreakNodes` | boolean | `false` | Specifically targets Word documents (DOCX). When set to true, officeParser also emits `break` nodes for `w:br`, `w:cr` and `w:lastRenderedPageBreak` elements. |
 
 ### OCR Scheduler & Resource Management
 If your application uses OCR, `officeParser` utilizes an intelligent **Smart Worker Pool** to maintain a background worker pool and optimize repeated parse requests.

diff --git a/docs/dist/officeparser.browser.iife.js b/docs/dist/officeparser.browser.iife.js
diff --git a/docs/dist/officeparser.browser.mjs b/docs/dist/officeparser.browser.mjs
diff --git a/docs/specs/ast_fragment.html b/docs/specs/ast_fragment.html
@@ -138,6 +138,11 @@ <h3>4. Rich Content</h3>
             <code>chartData</code> in metadata if requested.</li>
     </ul>
 
+    <h3>5. Other</h3>
+    <ul>
+        <li><strong><code>break</code></strong>: Currently only available for Word documents. <code>breakType</code> in metadata can be used to differentiate between line, (soft) page and column breaks.</li>
+    </ul>
+
     <h2>Text Formatting</h2>
     <p>Applied to <code>text</code>, <code>paragraph</code>, or <code>heading</code> nodes.</p>
     <pre><code>{

diff --git a/docs/specs/config_fragment.html b/docs/specs/config_fragment.html
@@ -121,6 +121,12 @@ <h2>OfficeParserConfig</h2>
                 <td><code>undefined</code></td>
                 <td><strong>Offline Environments</strong>: Paths to Tesseract scripts and data files for air-gapped or customized installations.</td>
             </tr>
+            <tr>
+                <td><code>includeBreakNodes</code></td>
+                <td><code>boolean</code></td>
+                <td><code>false</code></td>
+                <td><strong>Line/Page Break Parsing</strong>: Specifically targets Word documents (DOCX). When set to true, <code>officeParser</code> will also parse <code>w:br</code>, <code>w:cr</code> and <code>w:lastRenderedPageBreak</code> nodes.</td>
+            </tr>
         </tbody>
     </table>
 

diff --git a/src/OfficeParser.ts b/src/OfficeParser.ts
@@ -121,6 +121,7 @@ export class OfficeParser {
             preserveXmlWhitespace: false,
             pdfWorkerSrc: '',
             ocrConfig: {},
+            includeBreakNodes: false,
             ...actualConfig
         };
 

diff --git a/src/cli.ts b/src/cli.ts
@@ -102,6 +102,7 @@ if (fileArg) {
     console.log('  --includeRawContent=true     Include raw content in AST');
     console.log('  --serializeRawContent=true   Serialize raw XML content (default: true)');
     console.log('  --preserveXmlWhitespace=true Preserve whitespace in serialized XML (default: false)');
+    console.log('  --includeBreakNodes=false    Include break nodes (DOCX only, default: false)');
     console.log('  --verbose=true               Show full error stack traces');
     console.log('');
     console.log('Examples:');

diff --git a/src/index.ts b/src/index.ts
@@ -62,7 +62,8 @@ import {
     CellMetadata,
     ImageMetadata,
     PageMetadata,
-    ContentMetadata
+    ContentMetadata,
+    BreakMetadata,
 } from './types';
 
 const parseOffice = OfficeParser.parseOffice;
@@ -88,11 +89,9 @@ export {
     CellMetadata,
     ImageMetadata,
     PageMetadata,
-    ContentMetadata
+    ContentMetadata,
+    BreakMetadata,
 };
 
-
-
 // Default export for backward compatibility
 export default OfficeParser;
-
diff --git a/src/parsers/WordParser.ts b/src/parsers/WordParser.ts
@@ -34,17 +34,18 @@
  *   </w:body>
  * </w:document>
  * ```
- * 
+ *
  * **Key OOXML Elements:**
  * - `<w:p>` - Paragraph
  * - `<w:r>` - Run (contiguous text with same formatting)
  * - `<w:t>` - Text content
+ * - `<w:br>` - Line, page or column break
  * - `<w:b>`, `<w:i>`, `<w:u>` - Bold, italic, underline
  * - `<w:pStyle>` - Paragraph style (for headings)
  * - `<w:numPr>` - List numbering properties
  * - `<w:tbl>` - Table
  * - `<w:drawing>` - Drawing/image
- * 
+ *
  * **Parsing Approach:**
  * 1. Extract ZIP contents
  * 2. Parse word/document.xml for structure and text
@@ -53,13 +54,13 @@
  * 5. Extract footnotes from word/footnotes.xml
  * 6. Process embedded images from word/media/*
  * 7. Parse metadata from docProps/core.xml
- * 
+ *
  * @module WordParser
  * @see https://www.ecma-international.org/publications-and-standards/standards/ecma-376/ OOXML Standard
  * @see https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/ [MS-DOCX] Specification
  */
 
-import { ImageMetadata, ListMetadata, OfficeAttachment, OfficeContentNode, OfficeParserAST, OfficeParserConfig, TextFormatting, TextMetadata } from '../types.js';
+import { BreakMetadata, ImageMetadata, ListMetadata, OfficeAttachment, OfficeContentNode, OfficeParserAST, OfficeParserConfig, TextFormatting, TextMetadata } from '../types.js';
 import { logWarning } from '../utils/errorUtils.js';
 import { createAttachment } from '../utils/imageUtils.js';
 import { performOcr } from '../utils/ocrUtils.js';
@@ -444,26 +445,70 @@ export const parseWord = async (buffer: Buffer, config: OfficeParserConfig): Pro
                     formatting.backgroundColor = paraBackgroundColor;
                 }
 
-                // Text content
-                const tNodes = getElementsByTagName(runNode, "w:t");
-                for (const tNode of tNodes) {
-                    const tContent = tNode.textContent || '';
-                    text += tContent;
-                    const textNode: OfficeContentNode = {
-                        type: 'text',
-                        text: tContent,
-                        formatting: formatting
-                    };
-                    if (config.includeRawContent) {
-                        textNode.rawContent = getRawContent(tNode, documentContent, config);
+                for (const child of runNode.childNodes) {
+                    if (!isElement(child)) continue;
+
+                    // also handle unprefixed version (mirroring the behaviour of getElementsByTagName)
+
+                    // Text content
+                    if (child.tagName === "w:t" || child.tagName === "t") {
+                        const tNode = child;
+
+                        const tContent = tNode.textContent || '';
+                        text += tContent;
+                        const textNode: OfficeContentNode = {
+                            type: 'text',
+                            text: tContent,
+                            formatting: formatting
+                        };
+                        if (config.includeRawContent) {
+                            textNode.rawContent = getRawContent(tNode, documentContent, config);
+                        }
+                        // Always set a style: run style > paragraph style > detected default
+                        // Use detected default style for international compatibility
+                        const nodeStyle = rStyleVal || pStyleVal || defaultParaStyleId;
+                        if (nodeStyle) {
+                            textNode.metadata = { style: nodeStyle };
+                        }
+                        children.push(textNode);
                     }
-                    // Always set a style: run style > paragraph style > detected default
-                    // Use detected default style for international compatibility
-                    const nodeStyle = rStyleVal || pStyleVal || defaultParaStyleId;
-                    if (nodeStyle) {
-                        textNode.metadata = { style: nodeStyle };
+                    // Break nodes
+                    else if (config.includeBreakNodes &&
+                            (child.tagName === "w:br"
+                                || child.tagName === "br"
+                                || child.tagName === "w:cr"
+                                || child.tagName === "cr")
+                        ) {
+                        const brNode = child;
+
+                        const nodeBreakType = brNode.getAttribute("w:type");
+
+                        // 'textWrapping' is the default break type that should be
+                        // used when w:type is not specified
+                        let breakType: BreakMetadata['breakType'] = 'textWrapping';
+                        if (nodeBreakType !== null) {
+                            breakType = nodeBreakType as BreakMetadata['breakType'];
+                        }
+
+                        let breakClear: BreakMetadata["clear"] = undefined;
+                        if (breakType === 'textWrapping' && brNode.getAttribute("w:clear") !== null) {
+                            breakClear = brNode.getAttribute("w:clear") as BreakMetadata["clear"];
+                        }
+
+                        const breakNode: OfficeContentNode = {
+                            type: 'break',
+                            metadata: { breakType, clear: breakClear }
+                        };
+
+                        children.push(breakNode);
+                    } else if (config.includeBreakNodes && (child.tagName === "w:lastRenderedPageBreak" || child.tagName === "lastRenderedPageBreak")) {
+                        const breakNode: OfficeContentNode = {
+                            type: 'break',
+                            metadata: { breakType: 'lastRenderedPage' }
+                        };
+
+                        children.push(breakNode);
                     }
-                    children.push(textNode);
                 }
 
                 // Images/Drawings

diff --git a/src/types.ts b/src/types.ts
@@ -112,12 +112,19 @@ export interface OfficeParserConfig {
     preserveXmlWhitespace?: boolean;
     /**
      * The URL/path to the PDF.js worker script.
-     * 
+     *
      * **Mandatory** when using PDF parsing in browser environments to avoid worker configuration errors.
      * If not provided, it defaults to `https://unpkg.com/pdfjs-dist@5.6.205/build/pdf.worker.min.mjs`.
      * You can override this with your own local path or a different CDN link.
      */
     pdfWorkerSrc?: string;
+    /**
+     * Flag to include break nodes in the AST.
+     * This is currently only supported for Word documents (`w:br`, `w:cr` and `w:lastRenderedPageBreak` elements).
+     *
+     * Default is false
+     */
+    includeBreakNodes?: boolean;
 }
 
 /**
@@ -128,7 +135,7 @@ export type SupportedFileType = 'docx' | 'pptx' | 'xlsx' | 'odt' | 'odp' | 'ods'
 /**
  * Types of content nodes in the AST.
  */
-export type OfficeContentNodeType = 'paragraph' | 'heading' | 'table' | 'list' | 'text' | 'image' | 'chart' | 'drawing' | 'slide' | 'note' | 'sheet' | 'row' | 'cell' | 'page';
+export type OfficeContentNodeType = 'paragraph' | 'heading' | 'table' | 'list' | 'text' | 'image' | 'chart' | 'drawing' | 'slide' | 'note' | 'sheet' | 'row' | 'cell' | 'page' | 'break';
 
 /**
  * Supported MIME types for attachments.
@@ -438,10 +445,36 @@ export interface NoteMetadata {
     noteId?: string;
 }
 
+/**
+ * Metadata for break nodes.
+ * Used in DOCX files to track line, page and column breaks.
+ */
+export interface BreakMetadata {
+    /**
+     * Type of break. The break type determines the next location where
+     * text shall be placed.
+     * - 'column': The next text will be placed in the next column.
+     * - 'page': The next text will be placed on the next page.
+     * - 'lastRenderedPage': The editing application has inserted a soft break on the last save.
+     * - 'textWrapping' (default, assumed when not specified): The next text will be placed on the next line.
+     */
+    breakType:  'column' | 'page' | 'lastRenderedPage' | 'textWrapping';
+
+    /**
+     * Specifies the location which shall be used as the next available line when breakType
+     * has a value of 'textWrapping'. Should be ignored for other break types.
+     * - 'all': text wrapping break shall advance the text to the next line which spans the full width of the line
+     * - 'left': text wrapping break shall restart in next text region unblocked on the left
+     * - 'none': text wrapping break shall advance the text to the next line regardless of any floating objects
+     * - 'right': text wrapping break shall restart in next text region unblocked on the right
+     */
+    clear?: 'all' | 'left' | 'none' | 'right';
+}
+
 /**
  * Union type for content metadata.
  */
-export type ContentMetadata = SlideMetadata | SheetMetadata | HeadingMetadata | ListMetadata | CellMetadata | ImageMetadata | ChartMetadata | PageMetadata | ParagraphMetadata | TextMetadata | NoteMetadata | undefined;
+export type ContentMetadata = SlideMetadata | SheetMetadata | HeadingMetadata | ListMetadata | CellMetadata | ImageMetadata | ChartMetadata | PageMetadata | ParagraphMetadata | TextMetadata | NoteMetadata | BreakMetadata | undefined;
 
 
 /**

diff --git a/test/baseline/test.docx.json b/test/baseline/test.docx.json
@@ -412,9 +412,16 @@
     {
       "type": "paragraph",
       "text": "",
-      "children": [],
+      "children": [
+        {
+          "type": "break",
+          "metadata": {
+            "breakType": "textWrapping"
+          }
+        }
+      ],
       "metadata": {},
-      "rawContent": "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" w:rsidR=\"00386659\" w:rsidP=\"00230C50\" w:rsidRDefault=\"00386659\" xmlns:w14=\"http://schemas.microsoft.com/office/word/2010/wordml\" w14:paraId=\"672A6659\" wp14:textId=\"77777777\" xmlns:wp14=\"http://schemas.microsoft.com/office/word/2010/wordml\"/>"
+      "rawContent": "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" w:rsidR=\"00386659\" w:rsidP=\"00230C50\" w:rsidRDefault=\"00386659\" xmlns:w14=\"http://schemas.microsoft.com/office/word/2010/wordml\" w14:paraId=\"672A6659\" wp14:textId=\"77777777\" xmlns:wp14=\"http://schemas.microsoft.com/office/word/2010/wordml\"><w:r><w:br type=\"page\"/></w:r></w:p>"
     },
     {
       "type": "heading",
@@ -477,10 +484,16 @@
           "metadata": {
             "style": "Normal"
           }
+        },
+        {
+          "type": "break",
+          "metadata": {
+            "breakType": "textWrapping"
+          }
         }
       ],
       "metadata": {},
-      "rawContent": "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" w:rsidR=\"00386659\" w:rsidP=\"00386659\" w:rsidRDefault=\"00386659\" xmlns:w14=\"http://schemas.microsoft.com/office/word/2010/wordml\" w14:paraId=\"52539654\" wp14:textId=\"77777777\" xmlns:wp14=\"http://schemas.microsoft.com/office/word/2010/wordml\"><w:r><w:t>Here, we demonstrate various types of inline text formatting and the use of embedded fonts.</w:t></w:r></w:p>"
+      "rawContent": "<w:p xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\" w:rsidR=\"00386659\" w:rsidP=\"00386659\" w:rsidRDefault=\"00386659\" xmlns:w14=\"http://schemas.microsoft.com/office/word/2010/wordml\" w14:paraId=\"52539654\" wp14:textId=\"77777777\" xmlns:wp14=\"http://schemas.microsoft.com/office/word/2010/wordml\"><w:r><w:t>Here, we demonstrate various types of inline text formatting and the use of embedded fonts.</w:t><w:br/></w:r></w:p>"
     },
     {
       "type": "paragraph",

diff --git a/test/files/test.docx b/test/files/test.docx
diff --git a/test/testOfficeParser.ts b/test/testOfficeParser.ts
@@ -49,7 +49,8 @@ const FULL_CONFIG: Required<OfficeParserConfig> = {
     outputErrorToConsole: true,
     pdfWorkerSrc: '',
     serializeRawContent: true,
-    preserveXmlWhitespace: false
+    preserveXmlWhitespace: false,
+    includeBreakNodes: true
 };
 
 /** Config permutations to test */