原创

word转html

Maven依赖

<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.9</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.8</version>
</dependency>


import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;

/**
* @author: 木杉
* @email: 819236727@qq.com
* @Date: 2019/7/23 9:01
* @describe:
*/
public class Test {
public static void main(String[] args) throws Exception{
//word的文件流
InputStream inputStream = null;
HWPFDocument wordDocument = new HWPFDocument(inputStream);
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
DocumentBuilderFactory.newInstance().newDocumentBuilder()
.newDocument());

//获取word中的图片信息
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content,
PictureType pictureType, String suggestedName,
float widthInches, float heightInches) {
//content 图片的二进制
//pictureType 通过该对象里面的属性getExtension可以获取word图片的类型
//suggestedName 图片的名称
//widthInches 宽 heightInches高

//当转成html时候 <img src=""/> src里面保存的地址
return "http://xxxxxxx.jpg";
}
});

wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
ByteArrayOutputStream out = new ByteArrayOutputStream();
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(out);

TransformerFactory tf = TransformerFactory.newInstance();
Transformer serializer = tf.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);

//里面即是word转成html的字符串
String html = new String(out.toByteArray());
out.close();
}
}


正文到此结束