问题描述
最近做一个毕设,关于从pdf文件中提取纯文本的。从网上找了一段C写的代码,可以简单的实现从pdf中抽取文本,不过,只能提取英文,而不能提取汉字。研究了几天了,仍无头绪,急求大侠帮忙。下面附上c语言代码。#include<stdio.h>#include<windows.h>//YOurprojectmustalsoincludezdll.lib(ZLIB)asadependency.//ZLIBcanbefreelydownloadedfromtheinternet,www.zlib.org//Use4bytestructalignmentinyourproject!#include"zlib.h"#include<tchar.h>//Findastringinabuffer:size_tFindStringInBuffer(char*buffer,char*search,size_tbuffersize){char*buffer0=buffer;size_tlen=strlen(search);boolfnd=false;while(!fnd){fnd=true;for(size_ti=0;i<len;i++){if(buffer[i]!=search[i]){fnd=false;break;}}if(fnd)returnbuffer-buffer0;buffer=buffer+1;if(buffer-buffer0+len>=buffersize)return-1;}return-1;}//Keepthismanypreviousrecentcharactersforbackreference:#defineoldchar15//Convertarecentsetofcharactersintoanumberifthereisone.//Otherwisereturn-1:floatExtractNumber(constchar*search,intlastcharoffset){inti=lastcharoffset;while(i>0&&search[i]=='')i--;while(i>0&&(isdigit(search[i])||search[i]=='.'))i--;floatflt=-1.0;charbuffer[oldchar+5];ZeroMemory(buffer,sizeof(buffer));strncpy(buffer,search+i+1,lastcharoffset-i);if(buffer[0]&&sscanf(buffer,"%f",&flt)){returnflt;}return-1.0;}//Checkifacertain2charactertokenjustcamealong(e.g.BT):boolseen2(constchar*search,char*recent){if(recent[oldchar-3]==search[0]&&recent[oldchar-2]==search[1]&&(recent[oldchar-1]==''||recent[oldchar-1]==0x0d||recent[oldchar-1]==0x0a)&&(recent[oldchar-4]==''||recent[oldchar-4]==0x0d||recent[oldchar-4]==0x0a)){returntrue;}returnfalse;}//ThismethodprocessesanuncompressedAdobe(text)objectandextractstext.voidProcessOutput(FILE*file,char*output,size_tlen){//Arewecurrentlyinsideatextobject?//FILE*fText=fopen("c:\output2.txt","a+");//for(size_ti=0;i<len;i++){//fputc(output[i],fText);//}//fclose(fText);boolintextobject=false;//Isthenextcharacterliteral(e.g.\togetacharacteror(toget():boolnextliteral=false;//()Bracketnestinglevel.Textappearsinside()intrbdepth=0;//Keeppreviouscharstogetextractnumbersetc.:charoc[oldchar];intj=0;for(j=0;j<oldchar;j++)oc[j]='';for(size_t
i=0;i<len;i++){charc=output[i];if(intextobject){if(rbdepth==0&&seen2("TD",oc)){//Positioning.//Seeifanewlinehastostartorjustatab:floatnum=ExtractNumber(oc,oldchar-5);if(num>1.0){fputc(0x0d,file);fputc(0x0a,file);}if(num<1.0){fputc('t',file);}}if(rbdepth==0&&seen2("ET",oc)){//Endofatextobject,alsogotoanewline.intextobject=false;fputc(0x0d,file);fputc(0x0a,file);}elseif(c=='('&&rbdepth==0&&!nextliteral){//Startoutputtingtext!rbdepth=1;//Seeifaspaceortab(>1000)iscalledforbylooking//atthenumberinfrontof(intnum=ExtractNumber(oc,oldchar-1);if(num>0){if(num>1000.0){fputc('t',file);}elseif(num>100.0){fputc('',file);}}}elseif(c==')'&&rbdepth==1&&!nextliteral){//Stopoutputtingtextrbdepth=0;}elseif(rbdepth==1){//Justanormaltextcharacter:if(c=='\'&&!nextliteral){//Onlyprintoutnextcharacternomatterwhat.Donotinterpret.nextliteral=true;}else{nextliteral=false;if(((c>='')&&(c<='~'))||((c>=128)&&(c<255))){fputc(c,file);}}}}//Storetherecentcharactersforwhenwehavetogobackforanumber:for(j=0;j<oldchar-1;j++)oc[j]=oc[j+1];oc[oldchar-1]=c;if(!intextobject){if(seen2("BT",oc)){//Startofatextobject://printf("存在文字!n");intextobject=true;}}}}int_tmain(intargc,_TCHAR*argv[]){//Discardexistingoutput:FILE*fileo=fopen("c:\pdf\output2.txt","w");if(fileo)fclose(fileo);fileo=fopen("c:\pdf\output2.txt","a");intflag=2;//OpenthePDFsourcefile:FILE*filei=fopen("c:\pdf\123.pdf","rb");if(filei&&fileo){//Getthefilelength:intfseekres=fseek(filei,0,SEEK_END);//fseek==0ifoklongfilelen=ftell(filei);fseekres=fseek(filei,0,SEEK_SET);//Readethentirefileintomemory(!):char*buffer=newchar[filelen];ZeroMemory(buffer,filelen);size_tactualread=fread(buffer,filelen,1,filei);//mustreturn1boolmorestreams=true;//Nowsearchthebufferrepeatedforstreamsofdata:while(morestreams){//Searchforstream,endstream.Weoughttofirstcheckthefilter//oftheobjecttomakesureitifFlateDecode,butskipthatfornow!size_tstreamstart=FindStringInBuffer(buffer,"stream",filelen);size_tstreamend=FindStringInBuffer(buffer,"endstream",filelen);//printf("streamsta
rt=%u,streamend=%un",streamstart,streamend);//test//if(flag==2){//test//flag++;//size_tstringstart=FindStringInBuffer(buffer,"stream",filelen);//for(inttemp=0;temp<6;temp++){//printf("c=%cn",buffer[stringstart+temp]);//}//}if(streamstart>0&&streamend>streamstart){//Skiptobeginningandendofthedatastream:streamstart+=6;if(buffer[streamstart]==0x0d&&buffer[streamstart+1]==0x0a)streamstart+=2;elseif(buffer[streamstart]==0x0a)streamstart++;if(buffer[streamend-2]==0x0d&&buffer[streamend-1]==0x0a)streamend-=2;elseif(buffer[streamend-1]==0x0a)streamend--;//Assumeoutputwillfitinto10timesinputbuffer:size_toutsize=(streamend-streamstart)*10;char*output=newchar[outsize];ZeroMemory(output,outsize);//Nowusezlibtoinflate:z_streamzstrm;ZeroMemory(&zstrm,sizeof(zstrm));zstrm.avail_in=streamend-streamstart+1;zstrm.avail_out=outsize;zstrm.next_in=(Bytef*)(buffer+streamstart);zstrm.next_out=(Bytef*)output;intrsti=inflateInit(&zstrm);if(rsti==Z_OK){intrst2=inflate(&zstrm,Z_FINISH);if(rst2>=0){//Ok,gotsomething,extractthetext:size_ttotout=zstrm.total_out;ProcessOutput(fileo,output,totout);}}delete[]output;output=0;buffer+=streamend+7;filelen=filelen-(streamend+7);}else{morestreams=false;}}fclose(filei);}if(fileo)fclose(fileo);return0;}
解决方案
解决方案二:
请问现在解决了吗???我现在也碰到这问题了!!!
解决方案三:
问题如何解决的??能给点指示吗??小弟也遇到了此问题,急呀!
解决方案四:
楼主好淫,求公布解决方案~
解决方案五:
好吧,我这两天整理一下,将源码及解决过程发一下
解决方案六:
学习一下,完全不会
解决方案七:
楼主,问题解决了吗?小弟也遇到了此问题,能给点提示吗???
解决方案八:
楼主,zlib.h的代码分享一下吧,谢谢了
解决方案九:
求楼主公布解决方式~~~小弟谢谢了
解决方案十:
请问一下,楼主解决这个问题了没有啊?
解决方案十一:
大神源码何在呀
解决方案十二:
这就相当于程序自动识别验证码一样,唯一的优点是没有图片没有加干扰。。。你需要一个样本图像库来进行识别每个汉字。学习下Tesseract,比较初级的识别引擎。