背景
考试系统里,用户将试题内容以约定好的格式放到word里,系统批量录入试题并存入MongoDB。实现此功能需要解决的问题:
- word文件内容包含很多格式内容,需要去除掉
- word里面的图片需要保存,并以链接的形式存入数据库
- 公式的处理
实现的要点:
- 将word文件转换为HTML,.doc和.docx需要分别处理
- 将HTML多余的标签去掉
- 转化时需要保存图片
- 公式图片的格式是矢量格式wmf,在web里是无法显示的,所以需要格式转换
实现方法
-
依赖的库
pom.xml配置如下:
<!-- Word to HTML conversion -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>xdocreport</artifactId>
    <version>1.0.6</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.6</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.3</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>

<!-- HTML processing -->
<dependency>
    <!-- jsoup HTML parser library @ https://jsoup.org/ -->
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>

<!-- Formula image processing -->
<!-- https://mvnrepository.com/artifact/org.freehep/freehep-graphicsio-emf -->
<dependency>
    <groupId>org.freehep</groupId>
    <artifactId>freehep-graphicsio-emf</artifactId>
    <version>2.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.arnx/wmf2svg -->
<dependency>
    <groupId>net.arnx</groupId>
    <artifactId>wmf2svg</artifactId>
    <version>0.9.8</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-core -->
<dependency>
    <groupId>com.github.jai-imageio</groupId>
    <artifactId>jai-imageio-core</artifactId>
    <version>1.3.0</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlgraphics</groupId>
    <artifactId>batik-transcoder</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlgraphics</groupId>
    <artifactId>batik-codec</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlgraphics</groupId>
    <artifactId>xmlgraphics-commons</artifactId>
    <version>2.1</version>
</dependency>
-
代码实现
package com.ksxing.common.utils; import com.aspose.imaging.imageloadoptions.MetafileLoadOptions; import com.aspose.imaging.imageoptions.PngOptions; import com.ksxing.biz.CmodifyBiz; import com.ksxing.biz.FileUploadBiz; import java.awt.Graphics2D; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.StringWriter; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.zip.GZIPOutputStream; import javax.imageio.ImageIO; import javax.imageio.ImageTranscoder; import javax.swing.filechooser.FileSystemView; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import net.arnx.wmf2svg.gdi.svg.SvgGdi; import net.arnx.wmf2svg.gdi.wmf.WmfParser; import org.apache.batik.transcoder.TranscoderInput; import org.apache.batik.transcoder.TranscoderOutput; import org.apache.batik.transcoder.image.JPEGTranscoder; import org.apache.batik.transcoder.image.PNGTranscoder; import org.apache.commons.io.FileUtils; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.converter.WordToHtmlUtils; import org.apache.poi.hwpf.usermodel.PictureType; import org.apache.poi.xwpf.converter.core.BasicURIResolver; import org.apache.poi.xwpf.converter.core.FileImageExtractor; import 
org.apache.poi.xwpf.converter.xhtml.XHTMLConverter; import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; import org.apache.poi.xwpf.usermodel.BodyElementType; import org.apache.poi.xwpf.usermodel.IBodyElement; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.apache.xmlgraphics.image.loader.Image; import org.freehep.graphicsio.emf.EMFInputStream; import org.freehep.graphicsio.emf.EMFRenderer; import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMath; import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMathPara; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; import org.w3c.dom.Document; import org.w3c.dom.Node; /** * Created by alvy on 2017/09/15. 
*/ @Service public class OfficeToolService { @Autowired FileUploadBiz fileUploadBiz; /** * 将docx格式的word文档转换为html字符串 采用apache poi * * @param docxFile .docx文档 * @return html字符串 */ public String uploadDocxToHtml(MultipartFile docxFile) throws Exception { // 将上传的MultipartFIle转换为FileInputStream File convFile = new File(docxFile.getOriginalFilename()); docxFile.transferTo(convFile); XWPFDocument docxDocument = new XWPFDocument(new FileInputStream(convFile)); // 配置 XHTMLOptions options = XHTMLOptions.create(); // 设置图片存储路径 String firstImagePathStr = FileSystemView.getFileSystemView().getHomeDirectory().toString() + "/html_img" + System .currentTimeMillis(); options.setExtractor(new FileImageExtractor(new File(firstImagePathStr))); options.URIResolver(new BasicURIResolver(firstImagePathStr)); // 转换html ByteArrayOutputStream htmlStream = new ByteArrayOutputStream(); XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options); String htmlStr = htmlStream.toString(); /* // 将image文件转换为base64并替换到html字符串里 String middleImageDirStr = "/word/media"; String imageDirStr = firstImagePathStr + middleImageDirStr; File imageDir = new File(imageDirStr); String[] imageList = imageDir.list(); if (imageList != null) { for (int i = 0; i < imageList.length; i++) { String oneImagePathStr = imageDirStr + "/" + imageList[i]; File oneImageFile = new File(oneImagePathStr); String imageBase64Str = new String( Base64.encodeBase64(FileUtils.readFileToByteArray(oneImageFile)), "UTF-8"); htmlStr = htmlStr.replace(oneImagePathStr, "data:image/png;base64,"+imageBase64Str); } } */ // 将image文件上传到阿里云并将连接替换到html字符串里 String middleImageDirStr = "/word/media"; String imageDirStr = firstImagePathStr + middleImageDirStr; File imageDir = new File(imageDirStr); String[] imageList = imageDir.list(); if (imageList != null) { for (int i = 0; i < imageList.length; i++) { String oneImagePathStr = imageDirStr + "/" + imageList[i]; InputStream oneImageFile = new FileInputStream(oneImagePathStr); String timeStamp 
= new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); String imageName = timeStamp + "-" + imageList[i]; String aliyunImageUrl = fileUploadBiz.uploadFile(oneImageFile, imageName); htmlStr = htmlStr.replace(oneImagePathStr, aliyunImageUrl); } } //删除图片路径 File firstImagePath = new File(firstImagePathStr); FileUtils.deleteDirectory(firstImagePath); return htmlStr; } /** * 将03版本的word转换为html字符串 采用apache poi * * @param docFile word文档 * @return 返回html字符串 */ public String uploadDocToHtml(MultipartFile docFile) throws Exception { // 将上传的MultipartFIle转换为FileInputStream File convFile = new File(docFile.getOriginalFilename()); docFile.transferTo(convFile); HWPFDocumentCore docDocument = WordToHtmlUtils.loadDoc(convFile); WordToHtmlConverter docToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); //图片存储 final String imagePathStr = FileSystemView.getFileSystemView().getHomeDirectory().toString() + "/html_img" + System .currentTimeMillis(); docToHtmlConverter.setPicturesManager(new PicturesManager() { public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) { File imgPath = new File(imagePathStr); if (!imgPath.exists()) { imgPath.mkdirs(); } File file = new File(imagePathStr + "/" + suggestedName); try { OutputStream os = new FileOutputStream(file); os.write(content); os.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return imagePathStr + "/" + suggestedName; } }); // 转换html docToHtmlConverter.processDocument(docDocument); Document htmlDocument = docToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); 
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); out.close(); String htmlStr = new String(out.toByteArray()); // 将image文件上传到阿里云并将连接替换到html字符串里 File imageDir = new File(imagePathStr); String[] imageList = imageDir.list(); if (imageList != null) { for (int i = 0; i < imageList.length; i++) { // 图片路径 String oneImagePathStr = imagePathStr + "/" + imageList[i]; // 处理公式emf图片 String uploadImagePathStr = oneImagePathStr; if (oneImagePathStr.endsWith(".emf")) { InputStream is = new FileInputStream(oneImagePathStr); EMFInputStream eis = new EMFInputStream(is, EMFInputStream.DEFAULT_VERSION); EMFRenderer emfRenderer = new EMFRenderer(eis); final int width = (int) eis.readHeader().getBounds() .getWidth(); final int height = (int) eis.readHeader().getBounds() .getHeight(); // 设置图片的大小和样式 final BufferedImage result = new BufferedImage(width, height, BufferedImage.TYPE_4BYTE_ABGR); Graphics2D g2 = (Graphics2D) result.createGraphics(); emfRenderer.paint(g2); String url = oneImagePathStr.replace( oneImagePathStr.substring(oneImagePathStr.length() - 3), "png"); File outputfile = new File(url); // 写入到磁盘中(格式设置为png背景不会变为橙色) ImageIO.write(result, "png", outputfile); // 当前的图片写入到磁盘中后,将流关闭 if (eis != null) { eis.close(); uploadImagePathStr = url; } if (is != null) { is.close(); } } // if (oneImagePathStr.endsWith(".wmf")) { // // 将wmf转svg // //wps公式编辑器里的公式是wmf格式,转换为svg会变形 // String svgFile = oneImagePathStr.substring(0, // oneImagePathStr.lastIndexOf(".wmf")) // + ".svg"; // wmfToSvg(oneImagePathStr, svgFile); // uploadImagePathStr = svgFile; // // // 将svg转png,svg格式也能显示,不许转换为png // String jpgFile = oneImagePathStr.substring(0, // oneImagePathStr.lastIndexOf(".wmf")) // + ".png"; // svgToJpg(svgFile, jpgFile); // uploadImagePathStr = jpgFile; // // } // 上传到阿里云并替换路径, 不上传公式文件 if (!(uploadImagePathStr.endsWith(".wmf") || 
uploadImagePathStr.endsWith(".emf"))) { InputStream oneImageFile = new FileInputStream(uploadImagePathStr); String timeStamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); String imageName = timeStamp + "-" + uploadImagePathStr .substring(uploadImagePathStr.lastIndexOf("/") + 1, uploadImagePathStr.length()); String aliyunImageUrl = fileUploadBiz.uploadFile(oneImageFile, imageName); htmlStr = htmlStr.replace(oneImagePathStr, aliyunImageUrl); } } } //删除生成的图片路径 File firstImagePath = new File(imagePathStr); FileUtils.deleteDirectory(firstImagePath); return htmlStr; } /** * wps公式编辑器生成的公式是wmf格式,转换为svg文件 采用aspose.imaging,jar包去官网下载,然后加入到lib里 * * @param src mwf文件路径 * @param dest svg文件路径 */ public static void wmfToSvg(String src, String dest) { // Create an instance of Image class by loading an existing WMF image. com.aspose.imaging.Image image = com.aspose.imaging.Image .load(src, new com.aspose.imaging.imageloadoptions.MetafileLoadOptions(true)); try { // Create an instance of EmfRasterizationOptions class. final com.aspose.imaging.imageoptions.EmfRasterizationOptions options = new com.aspose.imaging.imageoptions.EmfRasterizationOptions(); options.setPageWidth(image.getWidth()); options.setPageHeight(image.getHeight()); // Call save method to convert WMF to SVG format by passing output file name and SvgOptions class instance. image.save(dest, new com.aspose.imaging.imageoptions.SvgOptions() { { setVectorRasterizationOptions(options); } } ); } finally { image.dispose(); } } }