C#使用iTextSharp将PDF转成文本的方法
本文实例讲述了C#使用iTextSharp将PDF转成文本的方法。分享给大家供大家参考。具体实现方法如下:
using System;
using System.Diagnostics;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

/// <summary>
/// Command-line sample that extracts the string tokens found in the content
/// streams of a PDF and writes them, one per line, to a text file.
/// </summary>
public class ParsingPDF
{
    /// <summary>
    /// Parses the PDF using <see cref="PRTokeniser"/> and writes every string
    /// token of every page to the destination file.
    /// </summary>
    /// <param name="src">The path to the original PDF file.</param>
    /// <param name="dest">The path to the resulting text file (created or overwritten).</param>
    public void parsePdf(string src, string dest)
    {
        // Open the PDF first so that an unreadable input does not leave an
        // empty output file behind.
        PdfReader reader = new PdfReader(src);
        try
        {
            // Disposing the writer flushes it and closes the underlying
            // FileStream, even when tokenizing throws part-way through.
            using (StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
            {
                int pageCount = reader.NumberOfPages;

                // PDF page numbers are 1-based.
                for (int pg = 1; pg <= pageCount; pg++)
                {
                    // We can inspect the syntax of the imported page.
                    byte[] streamBytes = reader.GetPageContent(pg);

                    // NOTE(review): some iTextSharp 5.x releases expect a
                    // RandomAccessFileOrArray here instead of byte[] - confirm
                    // against the library version the project references.
                    PRTokeniser tokenizer = new PRTokeniser(streamBytes);
                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                        {
                            output.WriteLine(tokenizer.StringValue);
                        }
                    }
                }
            }
        }
        finally
        {
            // The reader was previously never closed.
            reader.Close();
        }
    }

    /// <summary>
    /// Main method. Expects the input PDF path and, optionally, the output
    /// text path; when the latter is omitted it is derived from the input
    /// file name with a ".txt" extension.
    /// </summary>
    /// <param name="args">Command-line arguments: infile.pdf [outfile.txt].</param>
    static void Main(string[] args)
    {
        if (args.Length < 1 || args.Length > 2)
        {
            Console.WriteLine("USAGE: ParsePDF infile.pdf <outfile.txt>");
            return;
        }

        string pdfPath = args[0];
        string textPath = args.Length == 2
            ? args[1]
            : Path.GetFileNameWithoutExtension(pdfPath) + ".txt";

        try
        {
            // Stopwatch is not affected by system clock adjustments.
            Stopwatch timer = Stopwatch.StartNew();

            ParsingPDF example = new ParsingPDF();
            example.parsePdf(pdfPath, textPath);

            timer.Stop();
            Console.WriteLine("Parsing completed in {0:0.00} seconds.", timer.Elapsed.TotalSeconds);
        }
        catch (Exception ex)
        {
            // Best-effort console tool: report the failure instead of crashing.
            Console.WriteLine("ERROR: " + ex.Message);
        }
    }

    /// <summary>
    /// Render listener that writes the text it is given to a
    /// <see cref="StreamWriter"/>, wrapping text blocks and text chunks in
    /// angle brackets.
    /// </summary>
    public class MyTextRenderListener : IRenderListener
    {
        /// <summary>The writer to which the information will be written.</summary>
        protected StreamWriter output;

        /// <summary>
        /// Creates a render listener that will look for text.
        /// </summary>
        /// <param name="output">The writer that receives the extracted text.</param>
        public MyTextRenderListener(StreamWriter output)
        {
            this.output = output;
        }

        /// <summary>Writes the opening bracket of a text block.</summary>
        public void BeginTextBlock()
        {
            output.Write("<");
        }

        /// <summary>Writes the closing bracket of a text block and ends the line.</summary>
        public void EndTextBlock()
        {
            output.WriteLine(">");
        }

        /// <summary>Images are ignored by this listener.</summary>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
        }

        /// <summary>Writes the text of the rendered chunk wrapped in angle brackets.</summary>
        public void RenderText(TextRenderInfo renderInfo)
        {
            output.Write("<");
            output.Write(renderInfo.GetText());
            output.Write(">");
        }
    } // class MyTextRenderListener
} // class ParsingPDF
希望本文所述对大家的C#程序设计有所帮助。