Skip to content

Commit

Permalink
Adds a script to detect references. Works on #2.
Browse files Browse the repository at this point in the history
  • Loading branch information
eric-wieser committed Oct 1, 2014
1 parent 43d5ef7 commit ecc121b
Show file tree
Hide file tree
Showing 3 changed files with 183 additions and 0 deletions.
27 changes: 27 additions & 0 deletions database/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,33 @@ def html_content(self):
return ''.join('<p>' + line.replace('\n', '<br />') + '</p>' for line in self.content.split('\n\n'))


class ReviewRoomReference(Base):
    """
    A reference to a room within the text of a review. One piece of text can
    refer to multiple rooms. Location of the reference is stored to enable
    linking.
    """
    __tablename__ = prefix + 'review_room_reference'

    # Surrogate primary key.
    id = Column(Integer, primary_key=True)

    # Together these two columns identify the ReviewSection whose text
    # contains the reference (they are paired up in the primaryjoin below).
    review_id = Column(Integer, ForeignKey(ReviewSection.review_id))
    review_heading_id = Column(Integer, ForeignKey(ReviewSection.heading_id))
    # The room that the text refers to.
    room_id = Column(Integer, ForeignKey(Room.id))

    # Extent of the reference within the section's text.
    # NOTE(review): presumably offsets into ReviewSection.content with an
    # exclusive end (i.e. a regex match span) - confirm against the writer.
    start_idx = Column(Integer)
    end_idx = Column(Integer)

    # The section containing the reference. The join is spelled out because
    # the link is made of two columns; the backref exposes a section's
    # references as `refers_to`, ordered by where they start in the text.
    review_section = relationship(
        lambda: ReviewSection,
        backref=backref('refers_to', order_by=start_idx),
        foreign_keys=[review_id, review_heading_id],
        primaryjoin=(review_id == ReviewSection.review_id) & (review_heading_id == ReviewSection.heading_id)
    )
    # The referenced room; Room gains a `references` collection via the backref.
    room = relationship(lambda: Room, backref='references')



#Read: https://research.microsoft.com/pubs/64525/tr-2006-45.pdf
class Photo(Base):
__tablename__ = prefix + 'photos'
Expand Down
78 changes: 78 additions & 0 deletions find_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, joinedload_all
import database.orm as m
import regex as re

import reference_helper

# Database bootstrap: open the SQLite test database, make sure every mapped
# table exists, and create the single session `s` used by the rest of the
# script.
engine = create_engine('sqlite:///database/test.db')
m.Base.metadata.create_all(engine)

Session = sessionmaker(bind=engine)
s = Session()

def _sections_of(room):
    """Yield every review section reachable from `room`.

    Walks room -> listings -> occupancies -> reviews -> sections lazily.
    """
    for listing in room.listings:
        for occupancy in listing.occupancies:
            for review in occupancy.reviews:
                for section in review.sections:
                    yield section


def get_all_sections_by_room():
    """Yield a ``(room, sections)`` pair for every room in the database.

    ``sections`` is a lazy generator over all review sections written about
    that room.

    The room is handed to the helper as an argument rather than captured
    from the loop variable: a closure over the loop variable is late-bound,
    so a section generator consumed after this loop had advanced would have
    silently walked a *different* room.
    """
    for room in s.query(m.Room):
        yield room, _sections_of(room)

# Bulk-delete every stored ReviewRoomReference before the scan below adds the
# freshly detected ones; the delete is committed together with the new rows
# by the s.commit() at the end of the script.
s.query(m.ReviewRoomReference).delete()

# Sentinel returned by find_room when a reference only overlaps a room name
# made of several '/'-separated parts (or the other way round) instead of
# matching a room exactly. Compare with `is`.
DummyRoom = object()


def find_room(room, path_str):
    """Resolve a textual room reference relative to the room being reviewed.

    :param room: the room whose review contains the reference. Only
        ``room.parent.path`` is read; it is indexed from the end, so it is
        expected to be a sequence of the enclosing clusters, outermost first.
    :param path_str: sequence of name components. Every component but the
        last names a cluster; the last one names the room.
    :returns: the matching room object; ``DummyRoom`` if no room matches
        exactly but the name overlaps a '/'-separated room name; ``None`` if
        nothing matches (including when `path_str` is empty or has more
        components than the room has enclosing clusters).
    """
    if not path_str:
        return None

    # find the common base of the room and path
    try:
        base = room.parent.path[-len(path_str)]
    except IndexError:
        # the reference is deeper than the room's own ancestry: unresolvable
        return None

    # iterate down clusters:
    for item in path_str[:-1]:
        for child in base.children:
            if child.name == item:
                base = child
                break
        else:
            return None

    # look in rooms. An exact match always wins, so a partial match is only
    # remembered (not returned) until every room has been examined; this
    # keeps the result independent of the order of `base.rooms`.
    item = path_str[-1]
    item_parts = item.split('/')
    partial_match = False
    for child in base.rooms:
        if child.name.replace('*', '') == item:
            return child
        if item in child.name.split('/') or child.name in item_parts:
            partial_match = True

    return DummyRoom if partial_match else None


for room, sections in get_all_sections_by_room():
for section in sections:
for path, span in reference_helper.references_in(section.content):

ref_room = find_room(room, path)
if ref_room is DummyRoom:
continue

if ref_room is None:
print room, path, '=>', ref_room
continue

start_idx, end_idx = span
ref = m.ReviewRoomReference(
review_section=section,
start_idx=start_idx,
end_idx=end_idx,
room=ref_room
)

s.add(ref)

s.commit()
78 changes: 78 additions & 0 deletions reference_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import regex as re
import itertools

# Pattern recognising room references in free text. Three shapes are handled:
#   * a prefix (the word "rooms", or a single capital staircase letter)
#     followed by a range such as "3-5" / "3 to N5",
#   * the same prefix followed by a list such as "3, 4 and 5" or "7/8",
#   * the word "room" followed by a single number.
# The `number` group is deliberately reused; every capture is retrieved with
# the regex module's captures()/spans() API.
#
# The "gyp room" exclusion is a negative *lookbehind*. It used to be written
# as a lookahead placed directly in front of the literal "room", where it
# could never reject anything.
room_range = re.compile(r'''
    \b(?:
        # match the prefix indicating the next number is a room number
        (?:
            (?:(?i)rooms\ +)
        |
            (?P<staircase>[A-Z])
        )
        (?:(?i)
            # match a range of rooms, without all being listed
            (?P<range_start>\d+)
            \ *(?:-|to)\ *
            \g<staircase>?
            (?P<range_end>\d+)
        |
            # match a list of rooms, where only the first is prefixed
            (?:
                (?P<number>\d+)
                (?P<sep>,|\ |/|and|or|&)+
            )*
            (?P<number>\d++)
        )
    |
        # match a single room without a staircase. Intentionally special cased
        # so that it is skipped when the preceding word is gyp
        (?<!(?i:gyp)\ )(?:(?i)room)\ +(?P<number>\d++)
    )\b
    ''',
    re.V1 | re.X  # V1 needed for scoped case insensitivity
)
# NOTE(review): not referenced by any code visible in this file - confirm
# whether anything still needs it.
i = 0

def references_in_m(match):
    """Expand one `room_range` match into individual room references.

    :param match: a match object exposing ``captures``/``spans`` for repeated
        groups (as the `regex` module does) for the pattern's ``staircase``,
        ``number``, ``sep``, ``range_start`` and ``range_end`` groups.
    :returns: a generator of ``(path, span)`` pairs. ``path`` is a list made
        of the staircase letter (when one was captured) followed by the room
        number as a string; ``span`` is the ``(start, end)`` extent of the
        text that refers to that room.
    """
    staircase = match.captures('staircase')
    rooms = match.captures('number')

    if rooms:
        spans = match.spans('number')

        # first match should include staircase letter
        if staircase:
            first_start, first_end = spans[0]
            spans[0] = (first_start - 1, first_end)

        # special case rooms like N7/8: additionally offer the pair as one
        # combined room name covering both numbers
        if match.group('sep') == '/' and len(rooms) == 2:
            yield staircase + ['/'.join(rooms)], (spans[0][0], spans[1][1])

        if len(rooms) == 1:
            # a lone room owns the whole match, prefix included
            yield staircase + rooms, match.span()
        else:
            for room_id, span in zip(rooms, spans):
                yield staircase + [room_id], span
    else:
        # a range: every room between the two ends (inclusive, in ascending
        # order whichever way round it was written) shares the match's span
        low, high = sorted((
            int(match.group('range_start')),
            int(match.group('range_end')),
        ))
        span = match.span()

        for number in range(low, high + 1):
            yield staircase + [str(number)], span

def references_in(s):
    """Lazily yield a ``(path, span)`` pair for each room reference in `s`.

    Every non-overlapping match of `room_range` in the text is expanded with
    `references_in_m`, in order of appearance.
    """
    found = (
        reference
        for hit in room_range.finditer(s)
        for reference in references_in_m(hit)
    )
    for reference in found:
        yield reference

0 comments on commit ecc121b

Please sign in to comment.