我在使用 OpenXML (C#) 解析 *.docx 文档时遇到一个问题。
以下是我的处理步骤：
1. 加载 *.docx 文档
2. 接收段落列表
3. 在每个段落中查找文本、图像和表格元素
4. 为每个文本和图像元素创建 html 标签
5. 将输出保存为 *.html 文件
我已经弄清楚了如何在文档中找到图像文件并将其提取出来。
现在还有一步要做 - 找到表格在文本(段落)中的位置。
如果有人知道如何使用 OpenXML 查找 *.docx 文档中的表格,请提供帮助。
谢谢。
补充：
好吧，可能是我没有把自己的意思解释清楚。
如果我们获取段落内容，就可以在其中找到文本块、图片等子对象（Child）。
因此，如果段落中含有带图片的“运行”（Run）元素，就意味着 Word 文档中的这个位置放置了一张图像。
我的函数示例:
/// <summary>
/// Converts every paragraph of a *.docx file into an HTML &lt;div&gt; fragment.
/// </summary>
/// <remarks>
/// Only paragraph elements are enumerated, so this version never emits
/// &lt;table&gt; markup. Pictures found in runs are embedded as base64 data URIs.
/// </remarks>
/// <param name="pathToFile">Path of the *.docx file to read.</param>
/// <returns>The generated HTML: one &lt;div&gt; per paragraph, each followed by a line break.</returns>
public static string ParseDocxDocument(string pathToFile)
{
    StringBuilder result = new StringBuilder();

    // Nothing is modified, so open read-only; "using" releases the package even when parsing throws.
    using (WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, false))
    {
        List<ImagePart> imgPart = wordProcessingDoc.MainDocumentPart.ImageParts.ToList();
        int imgCounter = 0;

        foreach (Paragraph par in wordProcessingDoc.MainDocumentPart.Document.Descendants<Paragraph>())
        {
            var properties = par.ParagraphProperties;

            //Add new paragraph tag
            result.Append("<div style=\"width:100%; text-align:");

            //Append anchor style
            if (properties != null && properties.Justification != null && properties.Justification.Val != null)
            {
                var alignment = properties.Justification.Val.Value;
                if (alignment == JustificationValues.Left)
                {
                    result.Append("left;");
                }
                else if (alignment == JustificationValues.Center)
                {
                    result.Append("center;");
                }
                else if (alignment == JustificationValues.Both)
                {
                    result.Append("justify;");
                }
                else
                {
                    // Right and every other justification value map to "right".
                    result.Append("right;");
                }
            }
            else
            {
                result.Append("left;");
            }

            //Append text decoration style
            if (properties != null && properties.ParagraphMarkRunProperties != null)
            {
                foreach (OpenXmlElement chield in properties.ParagraphMarkRunProperties.ChildElements)
                {
                    switch (chield.GetType().Name)
                    {
                        case "Bold":
                            result.Append("font-weight:bold;");
                            break;
                        case "Underline":
                            result.Append("text-decoration:underline;");
                            break;
                        case "Italic":
                            result.Append("font-style:italic;");
                            break;
                        case "FontSize":
                            FontSize size = (FontSize)chield;
                            if (size.Val != null)
                            {
                                // NOTE(review): the raw attribute value is emitted as px; confirm the unit
                                // against the WordprocessingML definition of w:sz before relying on it.
                                result.Append("font-size:" + System.Net.WebUtility.HtmlEncode(size.Val.Value) + "px;");
                            }
                            break;
                        default:
                            break;
                    }
                }
            }
            result.Append("\">");

            //Add image tag
            foreach (Run run in par.Descendants<Run>())
            {
                foreach (OpenXmlElement chield in run.ChildElements.Where(o => o.GetType().Name == "Picture"))
                {
                    // Images are matched to pictures by running index; stop emitting when the parts run out
                    // instead of failing with an out-of-range exception.
                    if (imgCounter >= imgPart.Count)
                    {
                        continue;
                    }

                    // A picture without a VML shape simply gets an empty style attribute.
                    var shape = chield.Elements<DocumentFormat.OpenXml.Vml.Shape>().FirstOrDefault();
                    string style = (shape != null && shape.Style != null) ? shape.Style.Value : string.Empty;

                    string base64;
                    using (Stream imageStream = imgPart[imgCounter].GetStream())
                    {
                        base64 = GetBase64Image(imageStream);
                    }

                    // The MIME type is hard-coded to JPEG regardless of the actual image part.
                    result.Append(string.Format("<img style=\"{1}\" src=\"data:image/jpeg;base64,{0}\" />",
                        base64,
                        System.Net.WebUtility.HtmlEncode(style)));
                    imgCounter++;
                }
            }

            //Append inner text (escaped so that characters such as '<' or '&' cannot break the markup)
            bool hasText = false;
            foreach (Text t in par.Descendants<Text>())
            {
                hasText = true;
                result.Append(System.Net.WebUtility.HtmlEncode(t.Text));
            }
            if (!hasText)
            {
                // Keep paragraphs without any text visible as a blank line.
                result.Append("<br />");
            }

            result.Append("</div>");
            result.Append(Environment.NewLine);
        }
    }

    return result.ToString();
}
现在我想在文本中指定表格位置(如 Word 中所示)。
最终更新：
好了，各位，我已经弄明白了。我的示例函数中有一个很大的错误：我枚举的是文档正文中的段落元素，而表格与段落处于同一层级，因此函数会忽略表格。所以我们需要枚举文档主体（Body）的所有子元素。
这是我的测试函数,用于从 docx 生成正确的 HTML(这只是测试代码,所以它不干净)
/// <summary>
/// Converts the body of a *.docx file into an HTML fragment, keeping paragraphs
/// and tables in the order in which they appear in the document body.
/// </summary>
/// <param name="pathToFile">Path of the *.docx file to read.</param>
/// <returns>
/// The generated HTML: one &lt;div&gt; (followed by a line break) per body paragraph
/// and one &lt;table&gt; per body table. Other body elements are skipped.
/// </returns>
public static string ParseDocxDocument(string pathToFile)
{
    StringBuilder result = new StringBuilder();

    // Nothing is modified, so open read-only; "using" releases the package even when parsing throws.
    using (WordprocessingDocument wordProcessingDoc = WordprocessingDocument.Open(pathToFile, false))
    {
        var mainPart = wordProcessingDoc.MainDocumentPart;
        List<ImagePart> imgPart = mainPart.ImageParts.ToList();
        int imgCounter = 0;

        // Tables are siblings of paragraphs, so walk the direct children of the body.
        foreach (OpenXmlElement section in mainPart.Document.Body.Elements<OpenXmlElement>())
        {
            Paragraph par = section as Paragraph;
            if (par != null)
            {
                imgCounter = AppendParagraphHtml(result, par, imgPart, imgCounter);
                continue;
            }

            Table tab = section as Table;
            if (tab != null)
            {
                AppendTableHtml(result, tab);
            }
        }
    }

    return result.ToString();
}

/// <summary>
/// Appends one paragraph as a styled &lt;div&gt; containing its pictures and its escaped text.
/// </summary>
/// <param name="html">Builder that receives the markup.</param>
/// <param name="par">Paragraph to render.</param>
/// <param name="imageParts">Image parts of the main document part, consumed by running index.</param>
/// <param name="imageIndex">Index of the next image part to use.</param>
/// <returns>The index of the next unused image part.</returns>
private static int AppendParagraphHtml(StringBuilder html, Paragraph par, List<ImagePart> imageParts, int imageIndex)
{
    var properties = par.ParagraphProperties;

    //Add new paragraph tag
    html.Append("<div style=\"width:100%; text-align:");

    //Append anchor style
    if (properties != null && properties.Justification != null && properties.Justification.Val != null)
    {
        var alignment = properties.Justification.Val.Value;
        if (alignment == JustificationValues.Left)
        {
            html.Append("left;");
        }
        else if (alignment == JustificationValues.Center)
        {
            html.Append("center;");
        }
        else if (alignment == JustificationValues.Both)
        {
            html.Append("justify;");
        }
        else
        {
            // Right and every other justification value map to "right".
            html.Append("right;");
        }
    }
    else
    {
        html.Append("left;");
    }

    //Append text decoration style
    if (properties != null && properties.ParagraphMarkRunProperties != null)
    {
        foreach (OpenXmlElement chield in properties.ParagraphMarkRunProperties.ChildElements)
        {
            switch (chield.GetType().Name)
            {
                case "Bold":
                    html.Append("font-weight:bold;");
                    break;
                case "Underline":
                    html.Append("text-decoration:underline;");
                    break;
                case "Italic":
                    html.Append("font-style:italic;");
                    break;
                case "FontSize":
                    FontSize size = (FontSize)chield;
                    if (size.Val != null)
                    {
                        // NOTE(review): the raw attribute value is emitted as px; confirm the unit
                        // against the WordprocessingML definition of w:sz before relying on it.
                        html.Append("font-size:" + System.Net.WebUtility.HtmlEncode(size.Val.Value) + "px;");
                    }
                    break;
                default:
                    break;
            }
        }
    }
    html.Append("\">");

    //Add image tag
    foreach (Run run in par.Descendants<Run>())
    {
        foreach (OpenXmlElement chield in run.ChildElements.Where(o => o.GetType().Name == "Picture"))
        {
            // Images are matched to pictures by running index; stop emitting when the parts run out
            // instead of failing with an out-of-range exception.
            if (imageIndex >= imageParts.Count)
            {
                continue;
            }

            // A picture without a VML shape simply gets an empty style attribute.
            var shape = chield.Elements<DocumentFormat.OpenXml.Vml.Shape>().FirstOrDefault();
            string style = (shape != null && shape.Style != null) ? shape.Style.Value : string.Empty;

            string base64;
            using (Stream imageStream = imageParts[imageIndex].GetStream())
            {
                base64 = GetBase64Image(imageStream);
            }

            // The MIME type is hard-coded to JPEG regardless of the actual image part.
            html.Append(string.Format("<img style=\"{1}\" src=\"data:image/jpeg;base64,{0}\" />",
                base64,
                System.Net.WebUtility.HtmlEncode(style)));
            imageIndex++;
        }
    }

    //Append inner text (escaped so that characters such as '<' or '&' cannot break the markup)
    bool hasText = false;
    foreach (Text t in par.Descendants<Text>())
    {
        hasText = true;
        html.Append(System.Net.WebUtility.HtmlEncode(t.Text));
    }
    if (!hasText)
    {
        // Keep paragraphs without any text visible as a blank line.
        html.Append("<br />");
    }

    html.Append("</div>");
    html.Append(Environment.NewLine);
    return imageIndex;
}

/// <summary>
/// Appends one table as plain &lt;table&gt;/&lt;tr&gt;/&lt;td&gt; markup with the escaped text of each cell.
/// </summary>
/// <param name="html">Builder that receives the markup.</param>
/// <param name="tab">Table to render.</param>
private static void AppendTableHtml(StringBuilder html, Table tab)
{
    html.Append("<table>");

    // Only rows whose nearest enclosing table is this one: rows of nested tables would
    // otherwise be repeated as extra rows of the outer table.
    foreach (TableRow row in tab.Descendants<TableRow>().Where(r => r.Ancestors<Table>().FirstOrDefault() == tab))
    {
        TableRow currentRow = row;
        html.Append("<tr>");

        // Same idea for cells: skip cells that belong to a row of a nested table.
        foreach (TableCell cell in currentRow.Descendants<TableCell>().Where(c => c.Ancestors<TableRow>().FirstOrDefault() == currentRow))
        {
            html.Append("<td>");
            html.Append(System.Net.WebUtility.HtmlEncode(cell.InnerText));
            html.Append("</td>");
        }

        html.Append("</tr>");
    }

    html.Append("</table>");
}
/// <summary>
/// Reads a stream from its current position to the end and returns the bytes as Base64 text.
/// </summary>
/// <param name="inputData">Readable stream holding the image bytes; it does not have to be seekable.</param>
/// <returns>The Base64 representation of the bytes read; an empty string when nothing is left to read.</returns>
/// <exception cref="ArgumentNullException"><paramref name="inputData"/> is null.</exception>
private static string GetBase64Image(Stream inputData)
{
    if (inputData == null)
    {
        throw new ArgumentNullException("inputData");
    }

    using (MemoryStream collected = new MemoryStream())
    {
        // Stream.Read may return fewer bytes than requested, so keep reading until it reports 0.
        // Reading in chunks also avoids relying on Length, which non-seekable streams do not support.
        byte[] chunk = new byte[8192];
        int bytesRead;
        while ((bytesRead = inputData.Read(chunk, 0, chunk.Length)) > 0)
        {
            collected.Write(chunk, 0, bytesRead);
        }

        return Convert.ToBase64String(collected.ToArray());
    }
}