问题描述
小弟我要将有一定格式的word文档解析出来导入数据库,网上查到的方法有POI、JACOB、openoffice。JACOB只能运行在windows下,所以放弃,打算用POI。有没有哪位大神做过类似的?求指教。
解决方案
解决方案二:
word文档?做过EXCEL的,还不错
解决方案三:
excel我用poi还不错,word就纠结了
解决方案四:
推荐openoffice。如果是docx的话,poi是扛不住的;另外,doc中的公式poi也很难搞定。对于doc,openoffice解析得不错;对于docx中的公式,需要自己写代码再配合openoffice才能搞定。
解决方案五:
我要读取的word文档是一定的格式,都是文本,根据标题和内容分别存入数据库,poi不能实现吗?
解决方案六:
引用4楼u013114085的回复:
我要读取的word文档是一定的格式,都是文本,根据标题和内容分别存入数据库,poi不能实现吗?
都是文本的话,用doc格式应该没问题。poi官方说对docx支持不好,现在不知道是什么情况了,我没试过,不过我想都是文本的话应该可以。
解决方案七:
这个就三层结构,大标题,小标题,和内容,根据这三个内容存入不同的字段,能实现吗?
解决方案八:
现在我用的是poi3.9最新版本,2007应该支持的。
解决方案九:
引用7楼u013114085的回复:
现在我用的是poi3.9最新版本,2007应该支持的。
可以。有两种方式:一种是将word转为html,然后解析html。这样有个好处是可以保留word中的样式,目前我用的就是这种。另一种它应该有提取段落文字的方法,不过应该没有样式,我没有试过,不过感觉应该可以。
解决方案十:
// Forum reply: "Since the original poster is so sincere, here are two code snippets."
// Snippet 1: reading the contents of a Word document.
package com.lk.core.util.commons;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;

/**
 * Utility for reading a Word document through POI's {@link HWPFDocument}.
 * Only the Word 2003 ({@code .doc}) format is supported.
 *
 * <p>The document is loaded lazily on the first call that needs it. Call
 * {@link #closeStream()} (or {@link #close()}) when done; {@link #finalize()}
 * is only a last-resort safety net.
 *
 * @author zero
 */
public class WordDocUtil implements Closeable {

    /**
     * Minimum byte size for a picture to be returned by {@link #getAllDocPicture()}.
     * Some non-picture text boxes are reported as pictures when reading the
     * document; this threshold filters them out.
     */
    private static final int MIN_PICTURE_CONTEXT = 10000;

    /** Stream the document is loaded from. */
    private final FileInputStream fileInputStream;

    /** Parsed document; {@code null} until first needed. */
    private HWPFDocument wordDoc = null;

    /**
     * Creates a reader for the Word document at the given path.
     *
     * @param path path of the {@code .doc} file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public WordDocUtil(String path) throws FileNotFoundException {
        this.fileInputStream = new FileInputStream(path);
    }

    /**
     * Creates a reader for the given Word document file.
     *
     * @param f the {@code .doc} file
     * @throws FileNotFoundException if the file cannot be opened
     */
    public WordDocUtil(File f) throws FileNotFoundException {
        this.fileInputStream = new FileInputStream(f);
    }

    /**
     * Creates a reader on top of an already opened stream.
     *
     * @param is stream positioned at the start of a {@code .doc} file
     */
    public WordDocUtil(FileInputStream is) {
        this.fileInputStream = is;
    }

    /**
     * Loads the document from the stream if that has not happened yet.
     *
     * @throws NullPointerException if this instance was built with a null stream
     * @throws IOException if the document cannot be read
     */
    private void initWordDoc() throws NullPointerException, IOException {
        if (fileInputStream == null) {
            throw new NullPointerException("File input stream is null!");
        }
        if (wordDoc == null) {
            wordDoc = new HWPFDocument(fileInputStream);
        }
    }

    /**
     * Returns the text of paragraph {@code index} of {@code range}.
     *
     * @throws IndexOutOfBoundsException if {@code index} is negative or not
     *         smaller than the number of paragraphs
     */
    private static String paragraphText(Range range, int index) {
        if (index < 0 || index > range.numParagraphs() - 1) {
            throw new IndexOutOfBoundsException("paragraph index is out of range.");
        }
        return range.getParagraph(index).text();
    }

    /** Returns the picture at {@code index}, validating the index first. */
    private Picture pictureAt(int index) throws NullPointerException, IOException {
        initWordDoc();
        List<Picture> pictures = this.wordDoc.getPicturesTable().getAllPictures();
        if (index < 0 || index > pictures.size() - 1) {
            throw new IndexOutOfBoundsException("picture index is out of range.");
        }
        return pictures.get(index);
    }

    /**
     * Returns the text of the document, paragraph texts concatenated in order.
     * Pictures and other non-text content are not included.
     *
     * @param ignoreEmpty whether paragraphs that are blank after trimming are skipped
     * @return the concatenated paragraph text
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read
     */
    public String getText(boolean ignoreEmpty) throws NullPointerException, IOException {
        initWordDoc();
        // Fetch the range once instead of once per paragraph.
        Range range = this.wordDoc.getRange();
        int len = range.numParagraphs();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len; i++) {
            String text = range.getParagraph(i).text();
            if (!ignoreEmpty || !text.trim().isEmpty()) {
                sb.append(text);
            }
        }
        return sb.toString();
    }

    /**
     * Returns the number of paragraphs in the document.
     *
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read
     */
    public int getDocParagraphsNum() throws NullPointerException, IOException {
        initWordDoc();
        return this.wordDoc.getRange().numParagraphs();
    }

    /**
     * Returns the text of one paragraph. Content other than text (pictures
     * etc.) is not rendered correctly.
     *
     * @param index zero-based paragraph index
     * @return the paragraph text
     * @throws IndexOutOfBoundsException if {@code index} is negative or past the last paragraph
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read
     */
    public String getParagraphsText(int index) throws NullPointerException, IOException {
        initWordDoc();
        return paragraphText(this.wordDoc.getRange(), index);
    }

    /**
     * Returns the text of every paragraph, one list element per paragraph.
     *
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read
     */
    public List<String> getDocListText() throws NullPointerException, IOException {
        return getDocListText(null, true);
    }

    /**
     * Returns paragraph texts, optionally starting at a marker paragraph.
     *
     * @param beginContext trimmed text of the paragraph to start from (inclusive);
     *        {@code null} means start from the first paragraph
     * @param needContext if {@code true} and no paragraph matches
     *        {@code beginContext}, all paragraphs are returned instead of an
     *        empty list
     * @return the selected paragraph texts, in document order
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read
     */
    public List<String> getDocListText(String beginContext, boolean needContext)
            throws NullPointerException, IOException {
        initWordDoc();
        Range range = this.wordDoc.getRange();
        int len = range.numParagraphs();

        // Read every paragraph exactly once.
        List<String> all = new ArrayList<String>(len);
        for (int i = 0; i < len; i++) {
            all.add(range.getParagraph(i).text());
        }
        if (beginContext == null) {
            return all;
        }

        for (int i = 0; i < len; i++) {
            if (all.get(i).trim().equals(beginContext)) {
                return new ArrayList<String>(all.subList(i, len));
            }
        }
        // Marker not found: fall back to the whole document only when asked to.
        return needContext ? all : new ArrayList<String>();
    }

    /**
     * Returns the number of pictures in the document's picture table.
     *
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read
     */
    public int getDocPictureNum() throws NullPointerException, IOException {
        initWordDoc();
        return this.wordDoc.getPicturesTable().getAllPictures().size();
    }

    /**
     * Writes one picture of the document to {@code fos} and closes {@code fos}.
     *
     * @param fos destination stream; closed once writing was attempted, even
     *        when the write fails
     * @param index zero-based picture index
     * @throws IndexOutOfBoundsException if {@code index} is out of range
     *         ({@code fos} is left untouched in that case)
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read or the picture cannot be written
     */
    public void getDocPicture(FileOutputStream fos, int index)
            throws NullPointerException, IOException {
        Picture p = pictureAt(index);
        try {
            p.writeImageContent(fos);
        } finally {
            fos.close();
        }
    }

    /**
     * Returns the raw content of one picture of the document.
     *
     * @param index zero-based picture index
     * @return the picture bytes
     * @throws IndexOutOfBoundsException if {@code index} is out of range
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read
     */
    public byte[] getDocPicture(int index) throws NullPointerException, IOException {
        return pictureAt(index).getContent();
    }

    /**
     * Returns the content of every picture whose size is at least
     * {@link #MIN_PICTURE_CONTEXT} bytes.
     *
     * @return the picture contents
     * @throws NullPointerException if the underlying stream is null
     * @throws IOException if the document cannot be read
     */
    public List<byte[]> getAllDocPicture() throws NullPointerException, IOException {
        initWordDoc();
        List<Picture> pictures = this.wordDoc.getPicturesTable().getAllPictures();
        List<byte[]> retList = new ArrayList<byte[]>();
        for (Picture p : pictures) {
            byte[] content = p.getContent();
            if (content.length >= MIN_PICTURE_CONTEXT) {
                retList.add(content);
            }
        }
        // Workaround kept from the original author: pictures were observed to
        // come back in a different order than in the document, so the first
        // two are swapped whenever the first is smaller than the second.
        // NOTE(review): this is a size heuristic tied to the author's
        // documents, not a general ordering fix.
        if (retList.size() >= 2 && retList.get(0).length < retList.get(1).length) {
            byte[] first = retList.get(0);
            retList.set(0, retList.get(1));
            retList.set(1, first);
        }
        return retList;
    }

    /** Closes the underlying input stream; failures while closing are ignored. */
    public void closeStream() {
        if (this.fileInputStream != null) {
            try {
                this.fileInputStream.close();
            } catch (IOException ignored) {
                // Best-effort cleanup: nothing useful can be done if close fails.
            }
        }
    }

    /** Same as {@link #closeStream()}; lets callers treat this object as a {@link Closeable}. */
    public void close() {
        closeStream();
    }

    /** Last-resort cleanup for callers that never called {@link #closeStream()}. */
    @Override
    protected void finalize() throws Throwable {
        try {
            closeStream();
        } finally {
            super.finalize();
        }
    }
}
解决方案十一:
// Snippet 2: converting a Word document to an HTML file.
package com.lk.core.util.commons;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.io.output.ByteArrayOutputStream;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.w3c.dom.Document;

/**
 * Converts a Word ({@code .doc}) document to HTML using POI.
 * Copyright (c) 2013 by zero.
 */
public class Word2HtmlUtil {

    /**
     * Name of the folder that receives the pictures extracted from the Word
     * document; it is created next to the generated HTML file.
     */
    private static final String IMAGE_FOLDER = "wordImage";

    /** Encoding used both by the HTML serializer and when writing the file. */
    private static final String OUTPUT_ENCODING = "utf-8";

    /**
     * Writes {@code content} to the file at {@code path} using
     * {@link #OUTPUT_ENCODING}.
     *
     * @throws IOException if the file cannot be created or written
     */
    private static void writeFile(String content, String path) throws IOException {
        FileOutputStream fos = new FileOutputStream(new File(path));
        try {
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, OUTPUT_ENCODING));
            bw.write(content);
            bw.flush();
        } finally {
            fos.close();
        }
    }

    /**
     * Makes sure {@code dir} exists as a directory, creating it if needed.
     *
     * @throws IOException if the directory does not exist and cannot be created
     */
    private static void ensureDirectory(File dir) throws IOException {
        // Re-check after mkdirs() so a directory created concurrently is not an error.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory: " + dir.getAbsolutePath());
        }
    }

    /**
     * Creates the HTML output folder and, inside it, the folder for the
     * pictures extracted from the Word document.
     *
     * @throws IOException if either folder cannot be created
     */
    private static void createOutPutFolder(String htmlFolder) throws IOException {
        File folder = new File(htmlFolder);
        ensureDirectory(folder);
        ensureDirectory(new File(folder.getAbsoluteFile(), IMAGE_FOLDER));
    }

    /**
     * Converts a Word document to an HTML file. Pictures of the document are
     * written to a {@code wordImage} folder next to the HTML file and are
     * referenced from the HTML by a relative path.
     *
     * @param fileName path of the source {@code .doc} file
     * @param outPutFile path of the HTML file to create
     * @throws TransformerException if the HTML document cannot be serialized
     * @throws IOException if the source cannot be read, or the output folders
     *         or the HTML file cannot be written
     * @throws ParserConfigurationException if no DOM builder can be created
     */
    public static void convert2Html(String fileName, String outPutFile)
            throws TransformerException, IOException, ParserConfigurationException {
        // Resolve to an absolute path first: getParent() is null for a bare file name.
        String folder = new File(outPutFile).getAbsoluteFile().getParent();
        if (folder == null) {
            throw new IOException("Cannot determine output folder for: " + outPutFile);
        }
        createOutPutFolder(folder);

        HWPFDocument wordDocument;
        FileInputStream in = new FileInputStream(fileName);
        try {
            wordDocument = new HWPFDocument(in);
        } finally {
            // NOTE(review): assumes the document is fully loaded once the
            // constructor returns - confirm for the POI version in use.
            in.close();
        }

        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content, PictureType pictureType,
                    String suggestedName, float widthInches, float heightInches) {
                // Relative path placed in the HTML; '/' rather than
                // File.separator because this is a URL, not a file-system path.
                return IMAGE_FOLDER + "/" + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);

        List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics != null) {
            File imageDir = new File(folder, IMAGE_FOLDER);
            for (Picture pic : pics) {
                FileOutputStream imageOut = null;
                try {
                    imageOut = new FileOutputStream(new File(imageDir, pic.suggestFullFileName()));
                    pic.writeImageContent(imageOut);
                } catch (FileNotFoundException e) {
                    // Best effort, as in the original: a picture that cannot
                    // be created does not abort the conversion.
                    e.printStackTrace();
                } finally {
                    if (imageOut != null) {
                        imageOut.close();
                    }
                }
            }
        }

        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, OUTPUT_ENCODING);
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));
        out.close();

        // Decode with the encoding the serializer produced, not the platform default.
        writeFile(new String(out.toByteArray(), OUTPUT_ENCODING), outPutFile);
    }
}