OpenXML介紹與應用-保留word特定表格與句子顯示在網頁上(OpenXML keep special(keyword) table and paragraph)
Office Open XML
OpenXML是微軟處理office的一套工具,OpenXML SDK可以讓程式開發人員在程式中使用OpenXML,這篇以一個小應用介紹。今天我想把整份word檔,只留下有關鍵字的句子與表格,並轉換成Html呈現。
紅色那段做讀檔並用OpenXML格式解析
橙色則是把表格中沒出現關鍵字的表格全刪除
黃色要把table的所有句子都加到list,這樣在刪除文句時才不會誤刪到表格內的句子
綠色最後再把非表格且沒出現關鍵字的句子刪除
接著要呈現Html,這段是網路上找得到的範例程式,就不再贅述,只要把對應的參數餵進去即可。
//呈現html
}
// Excerpt: keeps only the tables and paragraphs of a Word document that mention `keyword`.
// `fileBytes` (the .docx content as a byte[]) and `keyword` (the text to look for) are
// expected to be supplied by the surrounding code; they are not defined in this snippet.
using (MemoryStream memoryStream = new MemoryStream())
{
    memoryStream.Write(fileBytes, 0, fileBytes.Length);

    // Step 1: open the package in editable mode so elements can be removed and saved.
    using (WordprocessingDocument wDoc = WordprocessingDocument.Open(memoryStream, true))
    {
        MainDocumentPart mainPart = wDoc.MainDocumentPart;
        Document doc = mainPart.Document;
        var body = doc.Body;

        // Step 2: delete every table whose text does not contain the keyword.
        // The tables are snapshotted with ToList() first so that removing elements
        // does not disturb the lazy Descendants enumeration.
        List<DocumentFormat.OpenXml.Wordprocessing.Table> allTables =
            body.Descendants<DocumentFormat.OpenXml.Wordprocessing.Table>().ToList();
        foreach (DocumentFormat.OpenXml.Wordprocessing.Table candidate in allTables)
        {
            if (!candidate.InnerText.Contains(keyword))
            {
                candidate.Remove();
            }
        }

        // Step 3: remember every paragraph that lives inside a surviving table so that
        // step 4 cannot delete it. Membership is tracked by element identity rather than
        // by comparing text, so paragraphs in multi-paragraph cells are protected too and
        // a body paragraph whose text merely equals some cell's text is not.
        HashSet<Paragraph> keptTableParagraphs = new HashSet<Paragraph>(
            body.Descendants<DocumentFormat.OpenXml.Wordprocessing.Table>()
                .SelectMany(t => t.Descendants<Paragraph>()));

        // Step 4: delete the remaining paragraphs that do not qualify.
        foreach (Paragraph paragraph in doc.Descendants<Paragraph>().ToList())
        {
            string text = paragraph.InnerText;

            // Paragraphs of kept tables and paragraphs with no text are always kept.
            if (keptTableParagraphs.Contains(paragraph) || text.Length == 0)
            {
                continue;
            }

            // A paragraph survives only when it contains the keyword AND at least one digit.
            // NOTE(review): the digit requirement is carried over from the original code;
            // the accompanying article only mentions the keyword - confirm it is intended.
            bool qualifies = text.Contains(keyword) && text.Any(char.IsDigit);
            if (!qualifies)
            {
                paragraph.Remove();
            }
        }

        // Persist the modified main document part back into the package.
        doc.Save();
    }
}
紅色那段做讀檔並用OpenXML格式解析
橙色則是把表格中沒出現關鍵字的表格全刪除
黃色要把table的所有句子都加到list,這樣在刪除文句時才不會誤刪到表格內的句子
綠色最後再把非表格且沒出現關鍵字的句子刪除
接著要呈現Html,這段是網路上找得到的範例程式,就不再贅述,只要把對應的參數餵進去即可。
// Render the document as HTML.

// Path handed to Server.MapPath to locate the folder used for converted output.
static string DocxConvertedToHtmlDirectory = "DocxConvertedToHtml/";

/// <summary>
/// Converts the document bytes stored in <c>Session["ByteArray"]</c> to an HTML string.
/// </summary>
/// <returns>
/// The HTML produced by <c>ConvertToHtml</c>, or the literal <c>"Display Error"</c> when
/// the session holds no document or the conversion throws.
/// </returns>
protected string Display()
{
    byte[] byteArray = (byte[])(Session["ByteArray"]);
    if (byteArray == null)
    {
        return "Display Error";
    }

    try
    {
        DirectoryInfo convertedDocsDirectory = new DirectoryInfo(Server.MapPath(DocxConvertedToHtmlDirectory));
        if (!convertedDocsDirectory.Exists)
        {
            convertedDocsDirectory.Create();
        }

        // A fresh GUID per call keeps the image folders of separate conversions apart.
        string htmlFileName = Guid.NewGuid().ToString() + ".html";
        return ConvertToHtml(byteArray, convertedDocsDirectory, htmlFileName);
    }
    catch (Exception ex)
    {
        // Still best-effort for the caller, but no longer silent: record why it failed.
        System.Diagnostics.Trace.TraceError("Display: converting the document to HTML failed: {0}", ex);
        return "Display Error";
    }
}
/// <summary>
/// Converts a WordprocessingML (.docx) document held in memory to an XHTML string.
/// </summary>
/// <param name="byteArray">Raw bytes of the .docx package.</param>
/// <param name="desDirectory">Directory under which the image folder is created.</param>
/// <param name="htmlFileName">
/// HTML file name; it is only used to derive the name of the "{name}_files" image folder.
/// The HTML itself is returned to the caller and is not written to disk by this method.
/// </param>
/// <returns>The converted document serialized without formatting.</returns>
public static string ConvertToHtml(byte[] byteArray, DirectoryInfo desDirectory, string htmlFileName)
{
    FileInfo fiHtml = new FileInfo(System.IO.Path.Combine(desDirectory.FullName, htmlFileName));
    using (MemoryStream memoryStream = new MemoryStream())
    {
        memoryStream.Write(byteArray, 0, byteArray.Length);
        using (WordprocessingDocument wDoc = WordprocessingDocument.Open(memoryStream, true))
        {
            // "<html file without extension>_files": absolute path for saving images,
            // relative name for the src attributes emitted into the HTML.
            string imageDirectoryFullName =
                fiHtml.FullName.Substring(0, fiHtml.FullName.Length - fiHtml.Extension.Length) + "_files";
            string imageDirectoryRelativeName =
                fiHtml.Name.Substring(0, fiHtml.Name.Length - fiHtml.Extension.Length) + "_files";
            int imageCounter = 0;

            // The core properties part is optional in a package; fall back to an empty
            // title when it, or its dc:title element, is missing.
            string pageTitle = null;
            var coreProperties = wDoc.CoreFilePropertiesPart;
            if (coreProperties != null)
            {
                pageTitle = (string)coreProperties.GetXDocument().Descendants(DC.title).FirstOrDefault();
            }
            if (pageTitle == null)
            {
                pageTitle = string.Empty;
            }

            HtmlConverterSettings settings = new HtmlConverterSettings()
            {
                PageTitle = pageTitle,
                FabricateCssClasses = true,
                CssClassPrefix = "pt-",
                RestrictToSupportedLanguages = false,
                RestrictToSupportedNumberingFormats = false,
                ImageHandler = imageInfo =>
                {
                    DirectoryInfo localDirInfo = new DirectoryInfo(imageDirectoryFullName);
                    if (!localDirInfo.Exists)
                    {
                        localDirInfo.Create();
                    }
                    ++imageCounter;

                    // Content type is expected to look like "image/png"; skip anything else.
                    string[] contentTypeParts = imageInfo.ContentType.Split('/');
                    if (contentTypeParts.Length < 2)
                    {
                        return null;
                    }
                    string extension = contentTypeParts[1].ToLowerInvariant();

                    ImageFormat imageFormat = null;
                    if (extension == "png")
                    {
                        // Convert png to gif.
                        extension = "gif";
                        imageFormat = ImageFormat.Gif;
                    }
                    else if (extension == "gif")
                    {
                        imageFormat = ImageFormat.Gif;
                    }
                    else if (extension == "bmp")
                    {
                        imageFormat = ImageFormat.Bmp;
                    }
                    else if (extension == "jpeg")
                    {
                        imageFormat = ImageFormat.Jpeg;
                    }
                    else if (extension == "tiff")
                    {
                        // Convert tiff to gif.
                        extension = "gif";
                        imageFormat = ImageFormat.Gif;
                    }
                    else if (extension == "x-wmf")
                    {
                        extension = "wmf";
                        imageFormat = ImageFormat.Wmf;
                    }

                    // If the image format isn't one that we expect, ignore it,
                    // and don't return markup for the link.
                    if (imageFormat == null)
                    {
                        return null;
                    }

                    FileInfo imageFile = new FileInfo(
                        System.IO.Path.Combine(imageDirectoryFullName, "image" + imageCounter.ToString() + "." + extension));
                    try
                    {
                        imageInfo.Bitmap.Save(imageFile.FullName, imageFormat);
                    }
                    catch (System.Runtime.InteropServices.ExternalException)
                    {
                        // The image could not be written; omit it from the HTML.
                        return null;
                    }

                    // Use only the file name here: concatenating the FileInfo itself would
                    // append its full path after the relative folder and break the URL.
                    XElement img = new XElement(Xhtml.img,
                        new XAttribute(NoNamespace.src, imageDirectoryRelativeName + "/" + imageFile.Name),
                        imageInfo.ImgStyleAttribute,
                        imageInfo.AltText != null ?
                            new XAttribute(NoNamespace.alt, imageInfo.AltText) : null);
                    return img;
                }
            };

            XElement html = HtmlConverter.ConvertToHtml(wDoc, settings);

            // Note: the xhtml returned by ConvertToHtmlTransform contains objects of type
            // XEntity. PtOpenXmlUtil.cs defines the XEntity class. See
            // http://blogs.msdn.com/ericwhite/archive/2010/01/21/writing-entity-reference-using-linq-to-xml.aspx
            // for a detailed explanation.
            //
            // If you further transform the XML tree returned by ConvertToHtmlTransform, you
            // must do it correctly, or entities will not be serialized properly.
            return html.ToString(SaveOptions.DisableFormatting);
        }
    }
}
留言
張貼留言