// ListParser.java

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

import org.faceless.pdf2.PDF;
import org.faceless.pdf2.PDFParser;
import org.faceless.pdf2.PDFReader;
import org.faceless.pdf2.PageExtractor;
import org.faceless.pdf2.PageExtractor.Text;

/**
 * Reads the registration table stored on the first page of
 * {@code listing.pdf} and prints it.
 *
 * <p>
 * Each table row is expected to look like
 * {@code name  e-mail  arrival  departure  [notes]}. The e-mail cell is used
 * as the anchor of a row because it is the easiest one to recognise; the
 * other cells are reached through the row neighbours of that cell.
 */
public class ListParser {
	/** Path of the PDF listing opened by {@link #firstTry()} and {@link #theRealThing()}. */
	private static final String LISTING_PATH = "listing.pdf";

	/**
	 * Pattern handed to the page extractor to locate the e-mail cells.
	 * Compiled once because it never changes.
	 */
	private static final Pattern EMAIL_PATTERN = Pattern.compile(
			"^[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,6}$",
			Pattern.CASE_INSENSITIVE);

	/** Length of a regular (non daylight-saving-change) day in milliseconds. */
	private static final long MILLIS_PER_DAY = 24L * 60L * 60L * 1000L;

	/** One row of the listing: who comes, when, and an optional remark. */
	public static class Entry {
		public final String name, email, notes;
		public final Calendar arrival, departure;

		/**
		 * @param name      content of the name cell
		 * @param email     content of the e-mail cell
		 * @param arrival   arrival date
		 * @param departure departure date
		 * @param notes     free-text remark, or {@code null} when the row has none
		 */
		Entry(String name, String email, Calendar arrival, Calendar departure,
				String notes) {
			this.name = name;
			this.email = email;
			this.notes = notes;
			this.arrival = arrival;
			this.departure = departure;
		}

		/**
		 * Renders the entry as a sentence giving both dates (ISO
		 * {@code yyyy-MM-dd}) and the number of nights in between, followed
		 * by the notes on a second line when there are any.
		 */
		@Override
		public String toString() {
			// Lower-case 'y' is the calendar year; upper-case 'Y' is the
			// week-based year and prints the wrong year around New Year.
			SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
			StringBuilder text = new StringBuilder();
			text.append(name).append(" (").append(email)
					.append(")  will arrive the ")
					.append(format.format(arrival.getTime()))
					.append(" and will leave the ")
					.append(format.format(departure.getTime()))
					.append(" (stays for ")
					.append(nightsBetween(arrival, departure))
					.append(" nights)");
			if (notes != null) {
				text.append("\n\t notes: ").append(notes);
			}
			return text.toString();
		}

		/**
		 * Counts the midnights between two dates. Both dates are first
		 * truncated to the start of their day so that the time of day does
		 * not matter, and the quotient is rounded so that a 23- or 25-hour
		 * day caused by a daylight-saving change still counts as one night.
		 */
		private static long nightsBetween(Calendar from, Calendar to) {
			long elapsed = startOfDay(to) - startOfDay(from);
			return Math.round(elapsed / (double) MILLIS_PER_DAY);
		}

		/** Returns the epoch milliseconds of 00:00:00.000 on the day of {@code moment}. */
		private static long startOfDay(Calendar moment) {
			// Work on a copy: the public Calendar fields must not be modified.
			Calendar day = (Calendar) moment.clone();
			day.set(Calendar.HOUR_OF_DAY, 0);
			day.set(Calendar.MINUTE, 0);
			day.set(Calendar.SECOND, 0);
			day.set(Calendar.MILLISECOND, 0);
			return day.getTimeInMillis();
		}
	}

	/**
	 * Dumps every text block of the first page, in display order, to
	 * standard output. Useful to inspect how the PDF is laid out.
	 *
	 * @throws IOException if the listing cannot be read
	 */
	public static void firstTry() throws IOException {
		PageExtractor pageExtractor = openFirstPage();
		System.out.println("###########################");
		System.out.println("#    In display order     #");
		System.out.println("###########################");
		for (Text t : pageExtractor.getTextInDisplayOrder()) {
			System.out.println(t.getText());
		}
	}

	/**
	 * Extracts one {@link Entry} per row of the first page and prints them
	 * to standard output. Rows that lack a name, arrival or departure cell,
	 * or whose dates cannot be parsed, are reported on standard error and
	 * skipped instead of aborting the whole run.
	 *
	 * @throws IOException if the listing cannot be read
	 */
	public static void theRealThing() throws IOException {
		PageExtractor pageExtractor = openFirstPage();

		List<Entry> inscriptions = new LinkedList<>();
		// Each row has a mail address -> easy to spot, so it anchors the row.
		for (Text emailHolder : pageExtractor.getMatchingText(EMAIL_PATTERN)) {
			Entry entry = readRow(emailHolder);
			if (entry == null) {
				System.err.println("Skipping incomplete or malformed row for "
						+ emailHolder.getText());
			} else {
				inscriptions.add(entry);
			}
		}

		for (Entry e : inscriptions) {
			System.out.println(e);
		}
	}

	public static void main(String[] args) throws IOException {
		firstTry();
		theRealThing();
	}

	/** Opens the listing and returns the text extractor of its first page. */
	private static PageExtractor openFirstPage() throws IOException {
		PDF listing = new PDF(new PDFReader(new File(LISTING_PATH)));
		PDFParser parser = new PDFParser(listing);
		return parser.getPageExtractor(0);
	}

	/**
	 * Builds the entry of the row that contains {@code emailHolder}.
	 *
	 * @return the entry, or {@code null} when a mandatory cell (name, arrival,
	 *         departure) is missing or a date cell is not a valid date
	 */
	private static Entry readRow(Text emailHolder) {
		// A row neighbour can be absent (the notes cell is optional), so
		// every neighbour is checked before it is dereferenced.
		Text nameHolder = emailHolder.getRowPrevious();
		Text arrivalHolder = emailHolder.getRowNext();
		Text departureHolder = (arrivalHolder == null) ? null
				: arrivalHolder.getRowNext();
		if (nameHolder == null || departureHolder == null) {
			return null;
		}

		Calendar arrivalDate = parseDate(arrivalHolder.getText());
		Calendar departureDate = parseDate(departureHolder.getText());
		if (arrivalDate == null || departureDate == null) {
			return null;
		}

		Text notesHolder = departureHolder.getRowNext();
		String notes = (notesHolder == null) ? null : notesHolder.getText();

		return new Entry(nameHolder.getText(), emailHolder.getText(),
				arrivalDate, departureDate, notes);
	}

	/**
	 * Parses a slash-separated date such as {@code 11/11/2011} into a
	 * calendar set to the start of that day.
	 *
	 * <p>
	 * NOTE(review): the parts are read as day/month/year. The only sample
	 * available (11/11/2011) fixes the position of the year but not the
	 * order of day and month - confirm against the real listing.
	 *
	 * @return the date, or {@code null} when {@code text} is {@code null},
	 *         does not have exactly three numeric parts, or names a day
	 *         that does not exist (e.g. 31/02/2011)
	 */
	private static Calendar parseDate(String text) {
		if (text == null) {
			return null;
		}
		String[] parts = text.trim().split("/");
		if (parts.length != 3) {
			return null;
		}

		int day, month, year;
		try {
			day = Integer.parseInt(parts[0].trim());
			month = Integer.parseInt(parts[1].trim());
			year = Integer.parseInt(parts[2].trim());
		} catch (NumberFormatException notANumber) {
			return null;
		}

		Calendar date = Calendar.getInstance();
		// Drop the current time of day that getInstance() puts in the calendar.
		date.clear();
		// Calendar months are zero-based (January is 0).
		date.set(year, month - 1, day);

		// A lenient calendar silently rolls impossible values over (month 13
		// becomes January of the next year); reading the fields back detects it.
		boolean existingDay = date.get(Calendar.YEAR) == year
				&& date.get(Calendar.MONTH) == month - 1
				&& date.get(Calendar.DAY_OF_MONTH) == day;
		return existingDay ? date : null;
	}
}