#!/usr/bin/python
# -*- coding: utf-8 -*-

# African American Civil Rights Timeline from Wikipedia
# Seminar Information Extraction - WS 2013/2014
# Fraser

# This program is a starter program for Python programmers who want to solve
# the same problem.
# It performs the complex match at the start of the Perl program, and just
# prints the date range.

import re
import sys
import glob

# Regular-expression fragments used to assemble the date-range pattern below.
ONETOKEN = r"[^ ]+"              # one or more non-space characters
ONEORTWODIGITS = r"[0-9][0-9]?"  # one digit, optionally followed by a second
OPTIONALSPACE = r" ?"            # zero or one space
WEIRDDASH = r"–"                 # the non-ASCII dash that appears in the text
SPACE = r" "                     # exactly one space

# Input file read by the main loop.
# NOTE(review): `file` shadows the Python 2 builtin of the same name; the name
# is kept because the loop further down refers to it.
file = "aacrm_text.txt"

# Here is the first large regular expression from the Perl program.
# Capture groups, in order:
#   1: a token                          (ONETOKEN)
#   2: one or two digits                (ONEORTWODIGITS)
#   3: range separator                  (WEIRDDASH, "-" or "to+")
#   4: an optional second token         (ONETOKEN, may be absent)
#   5: one or two digits                (ONEORTWODIGITS)
#   6: the dash that ends the range     (WEIRDDASH or "-")
# NOTE(review): "to+" also accepts "too", "tooo", ...; it is kept exactly as
# written - confirm against the Perl original whether plain "to" was meant.
multiple_entity_string = (
    "(" + ONETOKEN + ")" + SPACE
    + "(" + ONEORTWODIGITS + ")" + OPTIONALSPACE
    + "(" + WEIRDDASH + "|-|to+)" + OPTIONALSPACE
    + "(" + ONETOKEN + ")?" + SPACE
    + "(" + ONEORTWODIGITS + ")" + SPACE
    + "(" + WEIRDDASH + "|-)" + OPTIONALSPACE
)

# Compile the string to a regular expression object.
multiple_entity = re.compile(multiple_entity_string)

# Line counter; the main loop increments it once per input line.
lc = 0

# NOTE(review): the main loop is cut off in the reviewed excerpt (it ends in
# the middle of a string literal), so it is left untouched here rather than
# reconstructed from guesswork. The visible fragment, verbatim:
#     for line in open(file, 'r'): lc += 1 if re.match("^