Adds a script to detect references. Works on #2.

eric-wieser · Oct 1, 2014 · ecc121b · ecc121b
1 parent 43d5ef7
commit ecc121b
Show file tree

Hide file tree

Showing 3 changed files with 183 additions and 0 deletions.
diff --git a/database/orm.py b/database/orm.py
@@ -440,6 +440,33 @@ def html_content(self):
 		return ''.join('<p>' + line.replace('\n', '<br />') + '</p>' for line in self.content.split('\n\n'))
 
 
+class ReviewRoomReference(Base):
+	"""
+	A reference to a room within the text of a review section. One piece of
+	text can refer to multiple rooms. The location of each reference is
+	stored to enable linking.
+	"""
+	__tablename__ = prefix + 'review_room_reference'
+
+	id = Column(Integer, primary_key=True)
+
+	# these two columns together identify the ReviewSection containing the
+	# reference (see the primaryjoin on `review_section` below)
+	review_id = Column(Integer, ForeignKey(ReviewSection.review_id))
+	review_heading_id = Column(Integer, ForeignKey(ReviewSection.heading_id))
+	# the room that the text refers to
+	room_id = Column(Integer, ForeignKey(Room.id))
+
+	# location of the reference; find_references.py fills these from the span
+	# of a regex match on the section content.
+	# NOTE(review): presumably start is inclusive and end exclusive, as for
+	# regex match spans - confirm before using them to slice
+	start_idx = Column(Integer)
+	end_idx = Column(Integer)
+
+	# the section containing the reference, joined on both key columns. Each
+	# ReviewSection gets a `refers_to` collection ordered by start_idx
+	review_section = relationship(
+		lambda: ReviewSection,
+		backref=backref('refers_to', order_by=start_idx),
+		foreign_keys=[review_id, review_heading_id],
+		primaryjoin=(review_id == ReviewSection.review_id) & (review_heading_id == ReviewSection.heading_id)
+	)
+	# the referenced room. Each Room gets a `references` collection
+	room = relationship(lambda: Room, backref='references')
+
+
+
 #Read: https://research.microsoft.com/pubs/64525/tr-2006-45.pdf
 class Photo(Base):
 	__tablename__ = prefix + 'photos'

diff --git a/find_references.py b/find_references.py
@@ -0,0 +1,78 @@
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker, joinedload_all
+import database.orm as m
+import regex as re
+
+import reference_helper
+
+# setup db stuff
+Session = sessionmaker()
+engine = create_engine('sqlite:///database/test.db')
+m.Base.metadata.create_all(engine)
+Session.configure(bind=engine)
+s = Session()
+
+def get_all_sections_by_room():
+	for room in s.query(m.Room):
+		def gen():
+			for listing in room.listings:
+				for occupancy in listing.occupancies:
+					for review in occupancy.reviews:
+						for section in review.sections:
+							yield section
+
+		yield room, gen()
+
+# delete all existing references - they are regenerated by the loop below
+s.query(m.ReviewRoomReference).delete()
+
+# sentinel returned by find_room when a reference only matches one part of a
+# '/'-separated name. Such references are skipped without being reported
+DummyRoom = object()
+
+def find_room(room, path_str):
+	# find the common base of the room and path
+	base = room.parent.path[-len(path_str)]
+
+	# iterate down clusters:
+	for item in path_str[:-1]:
+		for child in base.children:
+			if child.name == item:
+				base = child
+				break
+		else:
+			return None
+
+	# look in rooms
+	item = path_str[-1]
+	for child in base.rooms:
+		if child.name.replace('*', '') == item:
+			return child
+		elif item in child.name.split('/'):
+			return DummyRoom
+		elif child.name in item.split('/'):
+			return DummyRoom
+	else:
+		return None
+
+
+for room, sections in get_all_sections_by_room():
+	for section in sections:
+		for path, span in reference_helper.references_in(section.content):
+
+			ref_room = find_room(room, path)
+			if ref_room is DummyRoom:
+				continue
+
+			if ref_room is None:
+				print room, path, '=>', ref_room
+				continue
+
+			start_idx, end_idx = span
+			ref = m.ReviewRoomReference(
+				review_section=section,
+				start_idx=start_idx,
+				end_idx=end_idx,
+				room=ref_room
+			)
+
+			s.add(ref)
+
+s.commit()
diff --git a/reference_helper.py b/reference_helper.py
@@ -0,0 +1,78 @@
+import regex as re
+import itertools
+
+room_range = re.compile(r'''
+	\b(?:
+		# match the prefix indicating the next number is a room number
+		(?:
+			(?:(?i)rooms\ +)
+			|
+			(?P<staircase>[A-Z])
+		)
+
+		(?:(?i)
+			# match a range of rooms, without all being listed
+			(?P<range_start>\d+)
+			\ *(?:-|to)\ *
+			\g<staircase>?
+			(?P<range_end>\d+)
+			|
+			# match a list of rooms, where only the first is prefixed
+			(?:
+				(?P<number>\d+)
+				(?P<sep>,|\ |/|and|or|&)+
+			)*
+			(?P<number>\d++)
+		)
+	|
+		# match a single room without a staircase. Intentionally special cased
+		(?!gyp\ )(?:(?i)room)\ +(?P<number>\d++)
+	)\b
+	''',
+	re.V1 | re.X # V1 needed for scoped case insensitivity
+)
+i = 0
+
+def references_in_m(match):
+	context = match.string[match.span()[0] - 10:match.span()[1] + 10]
+	staircase = match.captures('staircase')
+
+	if match.captures('number'):
+		rooms = match.captures('number')
+		spans = match.spans('number')
+
+		# first match should include staircase letter
+		if staircase:
+			s, e = spans[0]
+			spans[0] = (s - 1, e)
+
+		items = itertools.izip(rooms, spans)
+
+		# special case rooms like N7/8
+		if match.group('sep') == '/' and len(rooms) == 2:
+			yield staircase + ['/'.join(rooms)], (spans[0][0], spans[1][1])
+
+		if len(rooms) == 1:
+			yield staircase + rooms, match.span()
+
+		else:
+			for room_id, span in items:
+				yield staircase + [room_id], span
+	else:
+		start = int(match.group('range_start'))
+		end = int(match.group('range_end'))
+
+		if start < end:
+			rooms = map(str, range(start, end + 1))
+		else:
+			rooms = map(str, range(end, start + 1))
+
+		span = match.span()
+
+		for room_id in rooms:
+			yield staircase + [room_id], span
+
+def references_in(s):
+	for match in room_range.finditer(s):
+		for ref in references_in_m(match):
+			yield ref