View Javadoc
1   /*
2    * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  
26  package com.sun.xml.internal.org.jvnet.mimepull;
27  
28  import java.io.InputStream;
29  import java.io.IOException;
30  import java.util.*;
31  import java.util.logging.Logger;
32  import java.nio.ByteBuffer;
33  import java.util.logging.Level;
34  
35  /**
36   * Pull parser for MIME messages. Applications can use the pull API to
37   * continue parsing a MIME message lazily.
38   *
39   * <pre>
40   * for e.g.:
41   * <p>
42   *
43   * MIMEParser parser = ...
44   * Iterator<MIMEEvent> it = parser.iterator();
45   * while(it.hasNext()) {
46   *   MIMEEvent event = it.next();
47   *   ...
48   * }
49   * </pre>
50   *
51   * @author Jitendra Kotamraju
52   */
53  class MIMEParser implements Iterable<MIMEEvent> {
54  
55      private static final Logger LOGGER = Logger.getLogger(MIMEParser.class.getName());
56  
57      private static final String HEADER_ENCODING = "ISO8859-1";
58  
59      // Actually, the grammar doesn't support whitespace characters
60      // after boundary. But the mail implementation checks for it.
61      // We will only check for these many whitespace characters after boundary
62      private static final int NO_LWSP = 1000;
63      private enum STATE {START_MESSAGE, SKIP_PREAMBLE, START_PART, HEADERS, BODY, END_PART, END_MESSAGE}
64      private STATE state = STATE.START_MESSAGE;
65  
66      private final InputStream in;
67      private final byte[] bndbytes;
68      private final int bl;
69      private final MIMEConfig config;
70      private final int[] bcs = new int[128]; // BnM algo: Bad Character Shift table
71      private final int[] gss;                // BnM algo : Good Suffix Shift table
72  
73      /**
74       * Have we parsed the data from our InputStream yet?
75       */
76      private boolean parsed;
77  
78      /*
79       * Set to true once the terminating boundary line has been seen (or EOF
80       * was hit without one); no further body parts are started after that.
81       */
82      private boolean done = false;
83  
84      private boolean eof;
85      private final int capacity;
86      private byte[] buf;
87      private int len;
88      private boolean bol;        // beginning of the line
89  
90      /*
91       * Parses the MIME content. At the EOF, it also closes input stream
92       */
93      MIMEParser(InputStream in, String boundary, MIMEConfig config) {
94          this.in = in;
95          this.bndbytes = getBytes("--"+boundary);
96          bl = bndbytes.length;
97          this.config = config;
98          gss = new int[bl];
99          compileBoundaryPattern();
100 
101         // \r\n + boundary + "--\r\n" + lots of LWSP
102         capacity = config.chunkSize+2+bl+4+NO_LWSP;
103         createBuf(capacity);
104     }
105 
106     /**
107      * Returns iterator for the parsing events. Use the iterator to advance
108      * the parsing.
109      *
110      * @return iterator for parsing events
111      */
112     @Override
113     public Iterator<MIMEEvent> iterator() {
114         return new MIMEEventIterator();
115     }
116 
117     class MIMEEventIterator implements Iterator<MIMEEvent> {
118 
119         @Override
120         public boolean hasNext() {
121             return !parsed;
122         }
123 
124         @Override
125         public MIMEEvent next() {
126             switch(state) {
127                 case START_MESSAGE :
128                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_MESSAGE);}
129                     state = STATE.SKIP_PREAMBLE;
130                     return MIMEEvent.START_MESSAGE;
131 
132                 case SKIP_PREAMBLE :
133                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.SKIP_PREAMBLE);}
134                     skipPreamble();
135                     // fall through
136                 case START_PART :
137                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_PART);}
138                     state = STATE.HEADERS;
139                     return MIMEEvent.START_PART;
140 
141                 case HEADERS :
142                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.HEADERS);}
143                     InternetHeaders ih = readHeaders();
144                     state = STATE.BODY;
145                     bol = true;
146                     return new MIMEEvent.Headers(ih);
147 
148                 case BODY :
149                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.BODY);}
150                     ByteBuffer buf = readBody();
151                     bol = false;
152                     return new MIMEEvent.Content(buf);
153 
154                 case END_PART :
155                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_PART);}
156                     if (done) {
157                         state = STATE.END_MESSAGE;
158                     } else {
159                         state = STATE.START_PART;
160                     }
161                     return MIMEEvent.END_PART;
162 
163                 case END_MESSAGE :
164                     if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_MESSAGE);}
165                     parsed = true;
166                     return MIMEEvent.END_MESSAGE;
167 
168                 default :
169                     throw new MIMEParsingException("Unknown Parser state = "+state);
170             }
171         }
172 
173         @Override
174         public void remove() {
175             throw new UnsupportedOperationException();
176         }
177     }
178 
179     /**
180      * Collects the headers for the current part by parsing mesage stream.
181      *
182      * @return headers for the current part
183      */
184     private InternetHeaders readHeaders() {
185         if (!eof) {
186             fillBuf();
187         }
188         return new InternetHeaders(new LineInputStream());
189     }
190 
191     /**
192      * Reads and saves the part of the current attachment part's content.
193      * At the end of this method, buf should have the remaining data
194      * at index 0.
195      *
196      * @return a chunk of the part's content
197      *
198      */
199     private ByteBuffer readBody() {
200         if (!eof) {
201             fillBuf();
202         }
203         int start = match(buf, 0, len);     // matches boundary
204         if (start == -1) {
205             // No boundary is found
206             assert eof || len >= config.chunkSize;
207             int chunkSize = eof ? len : config.chunkSize;
208             if (eof) {
209                 done = true;
210                 throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
211             }
212             return adjustBuf(chunkSize, len-chunkSize);
213         }
214         // Found boundary.
215         // Is it at the start of a line ?
216         int chunkLen = start;
217         if (bol && start == 0) {
218             // nothing to do
219         } else if (start > 0 && (buf[start-1] == '\n' || buf[start-1] =='\r')) {
220             --chunkLen;
221             if (buf[start-1] == '\n' && start >1 && buf[start-2] == '\r') {
222                 --chunkLen;
223             }
224         } else {
225            return adjustBuf(start+1, len-start-1);  // boundary is not at beginning of a line
226         }
227 
228         if (start+bl+1 < len && buf[start+bl] == '-' && buf[start+bl+1] == '-') {
229             state = STATE.END_PART;
230             done = true;
231             return adjustBuf(chunkLen, 0);
232         }
233 
234         // Consider all the whitespace in boundary+whitespace+"\r\n"
235         int lwsp = 0;
236         for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
237             ++lwsp;
238         }
239 
240         // Check for \n or \r\n in boundary+whitespace+"\n" or boundary+whitespace+"\r\n"
241         if (start+bl+lwsp < len && buf[start+bl+lwsp] == '\n') {
242             state = STATE.END_PART;
243             return adjustBuf(chunkLen, len-start-bl-lwsp-1);
244         } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp] == '\r' && buf[start+bl+lwsp+1] == '\n') {
245             state = STATE.END_PART;
246             return adjustBuf(chunkLen, len-start-bl-lwsp-2);
247         } else if (start+bl+lwsp+1 < len) {
248             return adjustBuf(chunkLen+1, len-chunkLen-1);       // boundary string in a part data
249         } else if (eof) {
250             done = true;
251             throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
252         }
253 
254         // Some more data needed to determine if it is indeed a proper boundary
255         return adjustBuf(chunkLen, len-chunkLen);
256     }
257 
258     /**
259      * Returns a chunk from the original buffer. A new buffer is
260      * created with the remaining bytes.
261      *
262      * @param chunkSize create a chunk with these many bytes
263      * @param remaining bytes from the end of the buffer that need to be copied to
264      *        the beginning of the new buffer
265      * @return chunk
266      */
267     private ByteBuffer adjustBuf(int chunkSize, int remaining) {
268         assert buf != null;
269         assert chunkSize >= 0;
270         assert remaining >= 0;
271 
272         byte[] temp = buf;
273         // create a new buf and adjust it without this chunk
274         createBuf(remaining);
275         System.arraycopy(temp, len-remaining, buf, 0, remaining);
276         len = remaining;
277 
278         return ByteBuffer.wrap(temp, 0, chunkSize);
279     }
280 
281     private void createBuf(int min) {
282         buf = new byte[min < capacity ? capacity : min];
283     }
284 
285     /**
286      * Skips the preamble to find the first attachment part
287      */
288     private void skipPreamble() {
289 
290         while(true) {
291             if (!eof) {
292                 fillBuf();
293             }
294             int start = match(buf, 0, len);     // matches boundary
295             if (start == -1) {
296                 // No boundary is found
297                 if (eof) {
298                     throw new MIMEParsingException("Missing start boundary");
299                 } else {
300                     adjustBuf(len-bl+1, bl-1);
301                     continue;
302                 }
303             }
304 
305             if (start > config.chunkSize) {
306                 adjustBuf(start, len-start);
307                 continue;
308             }
309             // Consider all the whitespace boundary+whitespace+"\r\n"
310             int lwsp = 0;
311             for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
312                 ++lwsp;
313             }
314             // Check for \n or \r\n
315             if (start+bl+lwsp < len && (buf[start+bl+lwsp] == '\n' || buf[start+bl+lwsp] == '\r') ) {
316                 if (buf[start+bl+lwsp] == '\n') {
317                     adjustBuf(start+bl+lwsp+1, len-start-bl-lwsp-1);
318                     break;
319                 } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp+1] == '\n') {
320                     adjustBuf(start+bl+lwsp+2, len-start-bl-lwsp-2);
321                     break;
322                 }
323             }
324             adjustBuf(start+1, len-start-1);
325         }
326         if (LOGGER.isLoggable(Level.FINE)) {LOGGER.log(Level.FINE, "Skipped the preamble. buffer len={0}", len);}
327     }
328 
329     private static byte[] getBytes(String s) {
330         char [] chars= s.toCharArray();
331         int size = chars.length;
332         byte[] bytes = new byte[size];
333 
334         for (int i = 0; i < size;) {
335             bytes[i] = (byte) chars[i++];
336         }
337         return bytes;
338     }
339 
340         /**
341      * Boyer-Moore search method. Copied from java.util.regex.Pattern.java
342      *
343      * Pre calculates arrays needed to generate the bad character
344      * shift and the good suffix shift. Only the last seven bits
345      * are used to see if chars match; This keeps the tables small
346      * and covers the heavily used ASCII range, but occasionally
347      * results in an aliased match for the bad character shift.
348      */
349     private void compileBoundaryPattern() {
350         int i, j;
351 
352         // Precalculate part of the bad character shift
353         // It is a table for where in the pattern each
354         // lower 7-bit value occurs
355         for (i = 0; i < bndbytes.length; i++) {
356             bcs[bndbytes[i]&0x7F] = i + 1;
357         }
358 
359         // Precalculate the good suffix shift
360         // i is the shift amount being considered
361 NEXT:   for (i = bndbytes.length; i > 0; i--) {
362             // j is the beginning index of suffix being considered
363             for (j = bndbytes.length - 1; j >= i; j--) {
364                 // Testing for good suffix
365                 if (bndbytes[j] == bndbytes[j-i]) {
366                     // src[j..len] is a good suffix
367                     gss[j-1] = i;
368                 } else {
369                     // No match. The array has already been
370                     // filled up with correct values before.
371                     continue NEXT;
372                 }
373             }
374             // This fills up the remaining of optoSft
375             // any suffix can not have larger shift amount
376             // then its sub-suffix. Why???
377             while (j > 0) {
378                 gss[--j] = i;
379             }
380         }
381         // Set the guard value because of unicode compression
382         gss[bndbytes.length -1] = 1;
383     }
384 
385     /**
386      * Finds the boundary in the given buffer using Boyer-Moore algo.
387      * Copied from java.util.regex.Pattern.java
388      *
389      * @param mybuf boundary to be searched in this mybuf
390      * @param off start index in mybuf
391      * @param len number of bytes in mybuf
392      *
393      * @return -1 if there is no match or index where the match starts
394      */
395     private int match(byte[] mybuf, int off, int len) {
396         int last = len - bndbytes.length;
397 
398         // Loop over all possible match positions in text
399 NEXT:   while (off <= last) {
400             // Loop over pattern from right to left
401             for (int j = bndbytes.length - 1; j >= 0; j--) {
402                 byte ch = mybuf[off+j];
403                 if (ch != bndbytes[j]) {
404                     // Shift search to the right by the maximum of the
405                     // bad character shift and the good suffix shift
406                     off += Math.max(j + 1 - bcs[ch&0x7F], gss[j]);
407                     continue NEXT;
408                 }
409             }
410             // Entire pattern matched starting at off
411             return off;
412         }
413         return -1;
414     }
415 
416     /**
417      * Fills the remaining buf to the full capacity
418      */
419     private void fillBuf() {
420         if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "Before fillBuf() buffer len={0}", len);}
421         assert !eof;
422         while(len < buf.length) {
423             int read;
424             try {
425                 read = in.read(buf, len, buf.length-len);
426             } catch(IOException ioe) {
427                 throw new MIMEParsingException(ioe);
428             }
429             if (read == -1) {
430                 eof = true;
431                 try {
432                     if (LOGGER.isLoggable(Level.FINE)) {LOGGER.fine("Closing the input stream.");}
433                     in.close();
434                 } catch(IOException ioe) {
435                     throw new MIMEParsingException(ioe);
436                 }
437                 break;
438             } else {
439                 len += read;
440             }
441         }
442         if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "After fillBuf() buffer len={0}", len);}
443     }
444 
445     private void doubleBuf() {
446         byte[] temp = new byte[2*len];
447         System.arraycopy(buf, 0, temp, 0, len);
448         buf = temp;
449         if (!eof) {
450             fillBuf();
451         }
452     }
453 
454     class LineInputStream {
455         private int offset;
456 
457         /*
458          * Read a line containing only ASCII characters from the input
459          * stream. A line is terminated by a CR or NL or CR-NL sequence.
460          * A common error is a CR-CR-NL sequence, which will also terminate
461          * a line.
462          * The line terminator is not returned as part of the returned
463          * String. Returns null if no data is available. <p>
464          *
465          * This class is similar to the deprecated
466          * <code>DataInputStream.readLine()</code>
467          */
468         public String readLine() throws IOException {
469 
470             int hdrLen = 0;
471             int lwsp = 0;
472             while(offset+hdrLen < len) {
473                 if (buf[offset+hdrLen] == '\n') {
474                     lwsp = 1;
475                     break;
476                 }
477                 if (offset+hdrLen+1 == len) {
478                     doubleBuf();
479                 }
480                 if (offset+hdrLen+1 >= len) {   // No more data in the stream
481                     assert eof;
482                     return null;
483                 }
484                 if (buf[offset+hdrLen] == '\r' && buf[offset+hdrLen+1] == '\n') {
485                     lwsp = 2;
486                     break;
487                 }
488                 ++hdrLen;
489             }
490             if (hdrLen == 0) {
491                 adjustBuf(offset+lwsp, len-offset-lwsp);
492                 return null;
493             }
494 
495             String hdr = new String(buf, offset, hdrLen, HEADER_ENCODING);
496             offset += hdrLen+lwsp;
497             return hdr;
498         }
499 
500     }
501 
502 }