Save images from PDF Files using iTextSharp

This code walks through each page of a PDF document and saves the first image it finds on each page as a JPEG file.

/// <summary>
/// Walks every page of a PDF and saves the first image found on each page
/// as "{page number}.jpg" inside <paramref name="outputPath"/>.
/// </summary>
/// <param name="sourcePdf">Path of the PDF file to read.</param>
/// <param name="outputPath">
/// Directory that receives the extracted images. It is created the first time
/// an image is about to be written.
/// </param>
/// <remarks>
/// Only the first image per page is extracted (see FindImageInPDFDictionary).
/// Progress is reported with Response.Write, so the method expects to run
/// where a Response object is in scope.
/// NOTE(review): the bytes come from GetStreamBytesRaw and are handed straight to
/// System.Drawing.Image.FromStream; presumably only images stored in a format
/// GDI+ can decode directly (e.g. JPEG) will load - confirm with real documents.
/// </remarks>
public void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
    // NOTE: This will only get the first image it finds per page.
    PdfReader pdf = new PdfReader(sourcePdf);

    try
    {
        for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
        {
            Response.Write("Page " + pageNumber.ToString());
            PdfDictionary pg = pdf.GetPageN(pageNumber);

            // Recursively search pages, forms and groups for images.
            PdfObject obj = FindImageInPDFDictionary(pg);

            if (obj == null)
            {
                Response.Write("<br />Page " + pageNumber.ToString() + " has no images!");
                continue;
            }

            // The finder only returns indirect references, so the object number
            // can be read directly and resolved through the reader.
            int xrefIndex = ((PRIndirectReference)obj).Number;
            PRStream imageStream = (PRStream)pdf.GetPdfObject(xrefIndex);
            byte[] bytes = PdfReader.GetStreamBytesRaw(imageStream);
            if (bytes == null)
            {
                continue;
            }

            using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
            using (System.Drawing.Image img = System.Drawing.Image.FromStream(memStream))
            using (System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1))
            {
                // The image must be saved while its backing stream is still open.
                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(outputPath);

                string path = Path.Combine(outputPath, String.Format("{0}.jpg", pageNumber));

                // NOTE(review): Encoder.Compression is kept from the original code;
                // Encoder.Quality may be the parameter intended for JPEG - confirm.
                parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
                ImageCodecInfo jpegEncoder = GetEncoder(ImageFormat.Jpeg);
                img.Save(path, jpegEncoder, parms);
            }
        }
    }
    finally
    {
        // Always release the reader, even when a page fails to process.
        pdf.Close();
    }
}
60:

This method finds the first image on a PDF page, also searching inside nested forms and groups.

/// <summary>
/// Searches the XObject resources of a page (or form/group) dictionary for the
/// first image, descending into nested forms and groups.
/// </summary>
/// <param name="pg">Dictionary whose Resources/XObject entries are searched.</param>
/// <returns>
/// The indirect reference of the first image found, or <c>null</c> when the
/// dictionary has no resources, no XObjects, or no image among them.
/// </returns>
private PdfObject FindImageInPDFDictionary(PdfDictionary pg)
{
    if (pg == null)
    {
        return null;
    }

    // A page is not required to carry its own resources; treat that as "no image".
    PdfDictionary res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
    if (res == null)
    {
        return null;
    }

    PdfDictionary xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
    if (xobj == null)
    {
        return null;
    }

    foreach (PdfName name in xobj.Keys)
    {
        PdfObject obj = xobj.Get(name);
        if (obj == null || !obj.IsIndirect())
        {
            continue;
        }

        PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
        if (tg == null)
        {
            continue;
        }

        PdfName type = PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)) as PdfName;

        // Image directly among this dictionary's XObjects.
        if (PdfName.IMAGE.Equals(type))
        {
            return obj;
        }

        // Image inside a form or a group: only stop when the nested search
        // actually finds one, otherwise keep checking the remaining XObjects.
        if (PdfName.FORM.Equals(type) || PdfName.GROUP.Equals(type))
        {
            PdfObject nested = FindImageInPDFDictionary(tg);
            if (nested != null)
            {
                return nested;
            }
        }
    }

    return null;
}

If you are using ImageCodecInfo, you will also need this helper method to look up the encoder for a given image format:
103:
/// <summary>
/// Looks up the installed image encoder that handles the given format.
/// </summary>
/// <param name="format">Image format to find an encoder for, e.g. ImageFormat.Jpeg.</param>
/// <returns>
/// The matching <see cref="ImageCodecInfo"/>, or <c>null</c> when no installed
/// encoder has a FormatID equal to the format's Guid.
/// </returns>
private ImageCodecInfo GetEncoder(ImageFormat format)
{
    // The result is used for saving, so search the encoders rather than the decoders.
    ImageCodecInfo[] codecs = ImageCodecInfo.GetImageEncoders();

    foreach (ImageCodecInfo codec in codecs)
    {
        if (codec.FormatID == format.Guid)
        {
            return codec;
        }
    }

    return null;
}

Advertisements