// ListParser.java
import java.io.File; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.LinkedList; import java.util.List; import java.util.regex.Pattern; import org.faceless.pdf2.PDF; import org.faceless.pdf2.PDFParser; import org.faceless.pdf2.PDFReader; import org.faceless.pdf2.PageExtractor; import org.faceless.pdf2.PageExtractor.Text; public class ListParser { public static class Entry { public final String name, email, notes; public final Calendar arrival, departure; Entry(String name, String email, Calendar arrival, Calendar departure, String notes) { this.name = name; this.email = email; this.notes = notes; this.arrival = arrival; this.departure = departure; } @Override public String toString() { long timeDiffMillisec = departure.getTimeInMillis() - arrival.getTimeInMillis(); Calendar timeDiff = Calendar.getInstance(); timeDiff.setTimeInMillis(timeDiffMillisec); SimpleDateFormat format = new SimpleDateFormat("YYYY-MM-dd"); String strToReturn = name + " (" + email + ") will arrive the "; strToReturn += format.format(arrival.getTime()) + " and will leave the "; strToReturn += format.format(departure.getTime()) + " (stays for "; strToReturn += (timeDiff.get(Calendar.DAY_OF_YEAR) - 1) + " nights)"; strToReturn += ((notes == null) ? 
"" : ("\n\t notes: " + notes)); return strToReturn; } } public static void firstTry() throws IOException { PDF listing = new PDF(new PDFReader(new File("listing.pdf"))); PDFParser parser = new PDFParser(listing); System.out.println("###########################"); System.out.println("# In display order #"); System.out.println("###########################"); PageExtractor pageExtractor = parser.getPageExtractor(0); for (Text t : pageExtractor.getTextInDisplayOrder()) { System.out.println(t.getText()); } } public static void theRealThing() throws IOException { PDF listing = new PDF(new PDFReader(new File("listing.pdf"))); PDFParser parser = new PDFParser(listing); List<Entry> inscriptions = new LinkedList<>(); Pattern emailCatcher = Pattern.compile( "^[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,6}$", Pattern.CASE_INSENSITIVE); PageExtractor pageExtractor = parser.getPageExtractor(0); for (Text t : pageExtractor.getMatchingText(emailCatcher)) { // each row has a mail address -> easy to spot // proceed to the row extraction : // a row looks like that : name em@ai.l arrival departure (note) // t is pointing on the email bloc, let's extract the text right away. String email = t.getText(); // then let's look for the name bloc Text nameHolder = t.getRowPrevious(); String name = nameHolder.getText(); // now let's access the arrival and departure block: Text arrivalHolder = t.getRowNext(); String arrivalString = arrivalHolder.getText(); // split arrivalString (e.g. 
11/11/2011) to [ 11, 11, 2011] String[] arrival = arrivalString.split("/"); Text departureHolder = arrivalHolder.getRowNext(); String departureString = departureHolder.getText(); String[] departure = departureString.split("/"); String notes = null; Text notesHolder = departureHolder.getRowNext(); if (notesHolder != null) { notes = notesHolder.getText(); } Calendar arrivalDate = Calendar.getInstance(); arrivalDate.set(Integer.parseInt(arrival[0]), Integer.parseInt(arrival[1]), Integer.parseInt(arrival[2])); Calendar departureDate = Calendar.getInstance(); departureDate.set(Integer.parseInt(departure[0]), Integer.parseInt(departure[1]), Integer.parseInt(departure[2])); inscriptions.add(new Entry(name, email, arrivalDate, departureDate, notes)); } for (Entry e : inscriptions) { System.out.println(e); } } public static void main(String[] args) throws IOException { firstTry(); theRealThing(); } }