Java实现word文件转换为html

java

背景

考试系统里,用户将试题内容按约定好的格式放到word里,由系统批量录入试题并存入MongoDB。实现此功能需要解决的问题:

  • word文件内容包含很多格式内容,需要去除掉
  • word里面的图片需要保存,并以链接的形式存入数据库
  • 公式的处理

实现的要点:

  • 将word文件转换为HTML,.doc和.docx需要分别处理
  • 将HTML多余的标签去掉
  • 转化时需要保存图片
  • 公式图片是矢量格式(wmf/emf),在web里无法直接显示,所以需要进行格式转换
实现方法
  • 依赖的库

    pom.xml配置如下:

    <!--word转换为html-->
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.14</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>3.14</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.14</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>xdocreport</artifactId>
        <version>1.0.6</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
        <version>1.0.6</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml-schemas</artifactId>
        <version>3.14</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>ooxml-schemas</artifactId>
        <version>1.3</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.xdocreport.document</artifactId>
        <version>1.0.5</version>
    </dependency>
    <!-- 处理HTML -->
    <dependency>
        <!-- jsoup HTML parser library @ https://jsoup.org/ -->
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- 处理公式图片 -->
    <!-- https://mvnrepository.com/artifact/org.freehep/freehep-graphicsio-emf -->
    <dependency>
        <groupId>org.freehep</groupId>
        <artifactId>freehep-graphicsio-emf</artifactId>
        <version>2.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/net.arnx/wmf2svg -->
    <dependency>
        <groupId>net.arnx</groupId>
        <artifactId>wmf2svg</artifactId>
        <version>0.9.8</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-core -->
    <dependency>
        <groupId>com.github.jai-imageio</groupId>
        <artifactId>jai-imageio-core</artifactId>
        <version>1.3.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.xmlgraphics</groupId>
        <artifactId>batik-transcoder</artifactId>
        <version>1.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.xmlgraphics</groupId>
        <artifactId>batik-codec</artifactId>
        <version>1.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.xmlgraphics</groupId>
        <artifactId>xmlgraphics-commons</artifactId>
        <version>2.1</version>
    </dependency>
    
  • 代码实现

    package com.ksxing.common.utils;
    
    import com.aspose.imaging.imageloadoptions.MetafileLoadOptions;
    import com.aspose.imaging.imageoptions.PngOptions;
    import com.ksxing.biz.CmodifyBiz;
    import com.ksxing.biz.FileUploadBiz;
    import java.awt.Graphics2D;
    import java.awt.image.BufferedImage;
    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.io.OutputStreamWriter;
    import java.io.StringWriter;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    import java.util.zip.GZIPOutputStream;
    import javax.imageio.ImageIO;
    import javax.imageio.ImageTranscoder;
    import javax.swing.filechooser.FileSystemView;
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.transform.OutputKeys;
    import javax.xml.transform.Transformer;
    import javax.xml.transform.TransformerFactory;
    import javax.xml.transform.dom.DOMSource;
    import javax.xml.transform.stream.StreamResult;
    import javax.xml.transform.stream.StreamSource;
    import net.arnx.wmf2svg.gdi.svg.SvgGdi;
    import net.arnx.wmf2svg.gdi.wmf.WmfParser;
    import org.apache.batik.transcoder.TranscoderInput;
    import org.apache.batik.transcoder.TranscoderOutput;
    import org.apache.batik.transcoder.image.JPEGTranscoder;
    import org.apache.batik.transcoder.image.PNGTranscoder;
    import org.apache.commons.io.FileUtils;
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.HWPFDocumentCore;
    import org.apache.poi.hwpf.converter.PicturesManager;
    import org.apache.poi.hwpf.converter.WordToHtmlConverter;
    import org.apache.poi.hwpf.converter.WordToHtmlUtils;
    import org.apache.poi.hwpf.usermodel.PictureType;
    import org.apache.poi.xwpf.converter.core.BasicURIResolver;
    import org.apache.poi.xwpf.converter.core.FileImageExtractor;
    import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
    import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
    import org.apache.poi.xwpf.usermodel.BodyElementType;
    import org.apache.poi.xwpf.usermodel.IBodyElement;
    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.apache.poi.xwpf.usermodel.XWPFParagraph;
    import org.apache.poi.xwpf.usermodel.XWPFTable;
    import org.apache.poi.xwpf.usermodel.XWPFTableCell;
    import org.apache.poi.xwpf.usermodel.XWPFTableRow;
    import org.apache.xmlgraphics.image.loader.Image;
    import org.freehep.graphicsio.emf.EMFInputStream;
    import org.freehep.graphicsio.emf.EMFRenderer;
    import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMath;
    import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMathPara;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Service;
    import org.springframework.web.multipart.MultipartFile;
    import org.w3c.dom.Document;
    import org.w3c.dom.Node;
    
    
    /**
     * Created by alvy on 2017/09/15.
     */
    
    @Service
    public class OfficeToolService {
    
      @Autowired
      FileUploadBiz fileUploadBiz;
    
      /**
       * 将docx格式的word文档转换为html字符串 采用apache poi
       *
       * @param docxFile .docx文档
       * @return html字符串
       */
      public String uploadDocxToHtml(MultipartFile docxFile) throws Exception {
        // 将上传的MultipartFIle转换为FileInputStream
        File convFile = new File(docxFile.getOriginalFilename());
        docxFile.transferTo(convFile);
    
        XWPFDocument docxDocument = new XWPFDocument(new FileInputStream(convFile));
        // 配置
        XHTMLOptions options = XHTMLOptions.create();
    
        // 设置图片存储路径
        String firstImagePathStr =
            FileSystemView.getFileSystemView().getHomeDirectory().toString() + "/html_img" + System
                .currentTimeMillis();
        options.setExtractor(new FileImageExtractor(new File(firstImagePathStr)));
        options.URIResolver(new BasicURIResolver(firstImagePathStr));
    
        // 转换html
        ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
        String htmlStr = htmlStream.toString();
    
        /*
        // 将image文件转换为base64并替换到html字符串里
        String middleImageDirStr = "/word/media";
        String imageDirStr = firstImagePathStr + middleImageDirStr;
        File imageDir = new File(imageDirStr);
        String[] imageList = imageDir.list();
        if (imageList != null) {
          for (int i = 0; i < imageList.length; i++) {
            String oneImagePathStr = imageDirStr + "/" + imageList[i];
            File oneImageFile = new File(oneImagePathStr);
            String imageBase64Str = new String(
                Base64.encodeBase64(FileUtils.readFileToByteArray(oneImageFile)), "UTF-8");
            htmlStr = htmlStr.replace(oneImagePathStr, "data:image/png;base64,"+imageBase64Str);
          }
        }
        */
    
        // 将image文件上传到阿里云并将连接替换到html字符串里
        String middleImageDirStr = "/word/media";
        String imageDirStr = firstImagePathStr + middleImageDirStr;
        File imageDir = new File(imageDirStr);
        String[] imageList = imageDir.list();
        if (imageList != null) {
          for (int i = 0; i < imageList.length; i++) {
            String oneImagePathStr = imageDirStr + "/" + imageList[i];
            InputStream oneImageFile = new FileInputStream(oneImagePathStr);
            String timeStamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
            String imageName = timeStamp + "-" + imageList[i];
            String aliyunImageUrl = fileUploadBiz.uploadFile(oneImageFile, imageName);
            htmlStr = htmlStr.replace(oneImagePathStr, aliyunImageUrl);
          }
        }
        //删除图片路径
        File firstImagePath = new File(firstImagePathStr);
        FileUtils.deleteDirectory(firstImagePath);
    
        return htmlStr;
      }
    
      /**
       * 将03版本的word转换为html字符串 采用apache poi
       *
       * @param docFile word文档
       * @return 返回html字符串
       */
      public String uploadDocToHtml(MultipartFile docFile) throws Exception {
        // 将上传的MultipartFIle转换为FileInputStream
        File convFile = new File(docFile.getOriginalFilename());
        docFile.transferTo(convFile);
    
        HWPFDocumentCore docDocument = WordToHtmlUtils.loadDoc(convFile);
        WordToHtmlConverter docToHtmlConverter = new WordToHtmlConverter(
            DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
    
        //图片存储
        final String imagePathStr =
            FileSystemView.getFileSystemView().getHomeDirectory().toString() + "/html_img" + System
                .currentTimeMillis();
        docToHtmlConverter.setPicturesManager(new PicturesManager() {
          public String savePicture(byte[] content, PictureType pictureType, String suggestedName,
              float widthInches,
              float heightInches) {
            File imgPath = new File(imagePathStr);
            if (!imgPath.exists()) {
              imgPath.mkdirs();
            }
            File file = new File(imagePathStr + "/" + suggestedName);
            try {
              OutputStream os = new FileOutputStream(file);
              os.write(content);
              os.close();
            } catch (FileNotFoundException e) {
              e.printStackTrace();
            } catch (IOException e) {
              e.printStackTrace();
            }
            return imagePathStr + "/" + suggestedName;
          }
        });
    
        // 转换html
        docToHtmlConverter.processDocument(docDocument);
        Document htmlDocument = docToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(out);
        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        out.close();
        String htmlStr = new String(out.toByteArray());
    
        // 将image文件上传到阿里云并将连接替换到html字符串里
        File imageDir = new File(imagePathStr);
        String[] imageList = imageDir.list();
        if (imageList != null) {
          for (int i = 0; i < imageList.length; i++) {
            // 图片路径
            String oneImagePathStr = imagePathStr + "/" + imageList[i];
            // 处理公式emf图片
            String uploadImagePathStr = oneImagePathStr;
            if (oneImagePathStr.endsWith(".emf")) {
              InputStream is = new FileInputStream(oneImagePathStr);
              EMFInputStream eis = new EMFInputStream(is,
                  EMFInputStream.DEFAULT_VERSION);
              EMFRenderer emfRenderer = new EMFRenderer(eis);
              final int width = (int) eis.readHeader().getBounds()
                  .getWidth();
              final int height = (int) eis.readHeader().getBounds()
                  .getHeight();
              // 设置图片的大小和样式
              final BufferedImage result = new BufferedImage(width,
                  height, BufferedImage.TYPE_4BYTE_ABGR);
              Graphics2D g2 = (Graphics2D) result.createGraphics();
              emfRenderer.paint(g2);
              String url = oneImagePathStr.replace(
                  oneImagePathStr.substring(oneImagePathStr.length() - 3), "png");
              File outputfile = new File(url);
              // 写入到磁盘中(格式设置为png背景不会变为橙色)
              ImageIO.write(result, "png", outputfile);
              // 当前的图片写入到磁盘中后,将流关闭
              if (eis != null) {
                eis.close();
                uploadImagePathStr = url;
              }
              if (is != null) {
                is.close();
              }
            }
    //        if (oneImagePathStr.endsWith(".wmf")) {
    //          // 将wmf转svg
    //          //wps公式编辑器里的公式是wmf格式,转换为svg会变形
    //          String svgFile = oneImagePathStr.substring(0,
    //              oneImagePathStr.lastIndexOf(".wmf"))
    //              + ".svg";
    //          wmfToSvg(oneImagePathStr, svgFile);
    //          uploadImagePathStr = svgFile;
    //
    //          // 将svg转png,svg格式也能显示,不许转换为png
    //          String jpgFile = oneImagePathStr.substring(0,
    //              oneImagePathStr.lastIndexOf(".wmf"))
    //              + ".png";
    //          svgToJpg(svgFile, jpgFile);
    //          uploadImagePathStr = jpgFile;
    //
    //        }
    
            // 上传到阿里云并替换路径, 不上传公式文件
            if (!(uploadImagePathStr.endsWith(".wmf") || uploadImagePathStr.endsWith(".emf"))) {
              InputStream oneImageFile = new FileInputStream(uploadImagePathStr);
              String timeStamp = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
              String imageName = timeStamp + "-" + uploadImagePathStr
                  .substring(uploadImagePathStr.lastIndexOf("/") + 1, uploadImagePathStr.length());
              String aliyunImageUrl = fileUploadBiz.uploadFile(oneImageFile, imageName);
              htmlStr = htmlStr.replace(oneImagePathStr, aliyunImageUrl);
            }
          }
        }
        //删除生成的图片路径
        File firstImagePath = new File(imagePathStr);
        FileUtils.deleteDirectory(firstImagePath);
    
        return htmlStr;
      }
    
      /**
       * wps公式编辑器生成的公式是wmf格式,转换为svg文件 采用aspose.imaging,jar包去官网下载,然后加入到lib里
       *
       * @param src mwf文件路径
       * @param dest svg文件路径
       */
      public static void wmfToSvg(String src, String dest) {
        // Create an instance of Image class by loading an existing WMF image.
        com.aspose.imaging.Image image = com.aspose.imaging.Image
            .load(src, new com.aspose.imaging.imageloadoptions.MetafileLoadOptions(true));
        try {
          // Create an instance of EmfRasterizationOptions class.
          final com.aspose.imaging.imageoptions.EmfRasterizationOptions options = new com.aspose.imaging.imageoptions.EmfRasterizationOptions();
          options.setPageWidth(image.getWidth());
          options.setPageHeight(image.getHeight());
    
          // Call save method to convert WMF to SVG format by passing output file name and SvgOptions class instance.
          image.save(dest,
              new com.aspose.imaging.imageoptions.SvgOptions() {
                {
                  setVectorRasterizationOptions(options);
                }
              }
          );
        } finally {
          image.dispose();
        }
      }
    
    }