Extract document content separated by page breaks.

Share on Facebook | Tweet about this on Twitter | Share on LinkedIn

In Aspose.Words, it is easy to extract document content separated by section breaks, because each section's content is stored in its own Section node in the Aspose.Words object model.

But what if we need to extract content separated by page breaks, as requested by one of our users in the following forum post:

http://www.aspose.com/COMMUNITY/forums/ShowThread.aspx?PostID=73223

Here is a short and relatively simple piece of code that can help you solve this task:

Example

[C#]

/// <summary>
/// Extracts pages from the sample document as a number of separate documents
/// and saves each of them next to the source file.
/// The pages must be separated with page breaks.
/// We assume that the initial document has only one section; only the first
/// section is processed.
/// </summary>
private void ExtractPages()
{
    Document doc = new Document(MyPath + "PageSetup.PageBreaks.doc");

    // Start from the first top-level node of the body, not the first paragraph:
    // a document that opens with a table would otherwise lose that table.
    Node startNode = doc.FirstSection.Body.FirstChild;

    int pageNumber = 1;

    // ExtractPage moves startNode past the page it has just extracted and sets it
    // to null once the end of the body has been reached.
    while (startNode != null)
    {
        Document page = ExtractPage(doc, ref startNode);

        // Format only the file name: MyPath is not a format string and may itself
        // contain '{' or '}' characters.
        string fileName = string.Format("PageSetup.PageBreaks.Page{0} Out.doc", pageNumber);

        page.Save(MyPath + fileName);

        pageNumber++;
    }
}

 

/// <summary>
/// Extracts the part of the document starting from the specified node and ending with
/// the end of the document body or with a paragraph that contains a page break.
/// </summary>
/// <param name="doc">The source document that owns <paramref name="startNode"/>.</param>
/// <param name="startNode">
/// On entry, the first top-level node (paragraph or table) of the page to extract.
/// On return, the node that follows the paragraph containing the page break, or null
/// if the extracted page is the last page of the document.
/// </param>
/// <returns>
/// A new document whose body contains copies of the extracted nodes, with the page
/// break that terminated the page removed.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="doc"/> or <paramref name="startNode"/> is null.
/// </exception>
/// <exception cref="System.ApplicationException">
/// <paramref name="startNode"/> is not a direct child of a document body.
/// </exception>
private Document ExtractPage(Document doc, ref Node startNode)
{
    if (doc == null)
    {
        throw new System.ArgumentNullException("doc");
    }

    if (startNode == null)
    {
        throw new System.ArgumentNullException("startNode");
    }

    // Only top-level paragraphs/tables can be used as a starting node of a page.
    if (startNode.ParentNode == null || startNode.ParentNode.NodeType != NodeType.Body)
    {
        throw new System.ApplicationException("Starting node must be a paragraph or a table in the main story of the document.");
    }

    Document page = new Document();
    Body pageBody = page.FirstSection.Body;

    // Clear whatever content the new document's body starts with, so that the page
    // consists solely of imported nodes.
    pageBody.RemoveAllChildren();

    NodeImporter importer = new NodeImporter(doc, page, ImportFormatMode.KeepSourceFormatting);

    // The imported copy of the paragraph that ended the page, if any.
    Paragraph breakParagraph = null;

    Node node = startNode;

    while (node != null)
    {
        Node imported = importer.ImportNode(node, true);
        pageBody.AppendChild(imported);

        // NOTE: the whole paragraph is copied, so text that follows a page break
        // inside the same paragraph stays on this page.
        bool endsPage = false;

        if (node.NodeType == NodeType.Paragraph)
        {
            Paragraph para = (Paragraph)node;
            endsPage = para.GetText().IndexOf(ControlChar.PageBreakChar) >= 0;
        }

        node = node.NextSibling;

        if (endsPage)
        {
            breakParagraph = (Paragraph)imported;
            break;
        }
    }

    startNode = node;

    // Strip the page break only when one actually terminated the page. Removing the
    // last child unconditionally would delete real content from a page that simply
    // ran to the end of the body, and would fail on a page made only of tables.
    if (breakParagraph != null)
    {
        RemoveLastPageBreak(breakParagraph);
    }

    return page;
}

/// <summary>
/// Removes the last page break character found in the runs that are direct children
/// of the specified paragraph. Does nothing if no such run exists.
/// </summary>
private static void RemoveLastPageBreak(Paragraph para)
{
    // NOTE(review): GetText() on a paragraph may also report a section-ending
    // character that shares the page break code; in that case no run holds the
    // character and nothing is removed here - confirm against the Aspose.Words docs.
    for (Node child = para.LastChild; child != null; child = child.PreviousSibling)
    {
        if (child.NodeType != NodeType.Run)
        {
            continue;
        }

        Run run = (Run)child;
        int breakIndex = run.Text.LastIndexOf(ControlChar.PageBreakChar);

        if (breakIndex < 0)
        {
            continue;
        }

        string remaining = run.Text.Remove(breakIndex, 1);

        if (remaining.Length == 0)
        {
            // The run held nothing but the page break.
            run.Remove();
        }
        else
        {
            // Keep the run's other text and drop only the break character.
            run.Text = remaining;
        }

        return;
    }
}