-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds a script to detect references. Works on #2.
- Loading branch information
1 parent
43d5ef7
commit ecc121b
Showing
3 changed files
with
183 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from sqlalchemy import create_engine | ||
from sqlalchemy.orm import sessionmaker, joinedload_all | ||
import database.orm as m | ||
import regex as re | ||
|
||
import reference_helper | ||
|
||
# Database setup: open the SQLite file, create any tables declared on the ORM
# base that do not exist yet, and open the session shared by the whole script.
engine = create_engine('sqlite:///database/test.db')
m.Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
s = Session()
|
||
def _sections_of(room):
    """Yield every review section reachable from *room*.

    Walks ``room.listings`` -> ``occupancies`` -> ``reviews`` -> ``sections``
    lazily, in that nesting order.
    """
    for listing in room.listings:
        for occupancy in listing.occupancies:
            for review in occupancy.reviews:
                for section in review.sections:
                    yield section


def get_all_sections_by_room():
    """Yield ``(room, sections)`` for every ``m.Room`` in the session.

    ``sections`` is a lazy generator over all review sections of that room.
    The room is passed to the helper as an argument rather than captured from
    the loop variable, so each generator stays tied to its own room even if
    the caller collects several pairs before iterating any of them.
    """
    for room in s.query(m.Room):
        yield room, _sections_of(room)
|
||
# Start from a clean slate: delete every stored ReviewRoomReference before the
# loop further down adds the freshly detected ones.
s.query(m.ReviewRoomReference).delete()


# Sentinel returned by find_room when a reference only partially overlaps a
# slash-separated room name; the main loop skips these without reporting them.
DummyRoom = object()
|
||
def find_room(room, path_str):
    """Resolve a reference path relative to the room it was written about.

    ``path_str`` is a sequence of names: all but the last are cluster names
    to descend through, the last is the room name. The search starts at
    ``room.parent.path[-len(path_str)]``.

    Returns the matching room (its name is compared with any ``*`` removed),
    ``DummyRoom`` when the name only matches one ``/``-separated part of a
    room name (or vice versa), and ``None`` when nothing matches.
    """
    # the cluster shared by the referring room and the referenced path
    cluster = room.parent.path[-len(path_str)]

    # descend one cluster per leading path element
    for cluster_name in path_str[:-1]:
        cluster = next(
            (sub for sub in cluster.children if sub.name == cluster_name),
            None
        )
        if cluster is None:
            return None

    # the final element names a room inside the cluster we ended up in
    wanted = path_str[-1]
    for candidate in cluster.rooms:
        if candidate.name.replace('*', '') == wanted:
            return candidate
        if wanted in candidate.name.split('/'):
            return DummyRoom
        if candidate.name in wanted.split('/'):
            return DummyRoom

    return None
|
||
|
||
# Scan the content of every review section for room references and store one
# ReviewRoomReference per reference that resolves to a real room.
for room, sections in get_all_sections_by_room():
    for section in sections:
        for path, span in reference_helper.references_in(section.content):

            ref_room = find_room(room, path)

            # partial matches against slash-separated room names are ignored
            if ref_room is DummyRoom:
                continue

            if ref_room is None:
                # Report references that could not be resolved. A single
                # pre-formatted argument prints the same text on Python 2
                # and Python 3.
                print('%s %s => %s' % (room, path, ref_room))
                continue

            start_idx, end_idx = span
            ref = m.ReviewRoomReference(
                review_section=section,
                start_idx=start_idx,
                end_idx=end_idx,
                room=ref_room
            )

            s.add(ref)

# everything is written in one transaction at the very end
s.commit()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import regex as re | ||
import itertools | ||
|
||
# Pattern that finds room references in free text. It recognises three forms:
#   * a range  ("rooms 3-5", "N3 to N5")   -> groups range_start / range_end
#   * a list   ("rooms 3, 4 and 5", "N7/8") -> repeated captures of number / sep
#   * a single "room 5"                     -> one capture of number
# An optional single capital letter before the number is captured as staircase.
# Repeated captures are read with the regex module's captures()/spans().
room_range = re.compile(r'''
    \b(?:
        # match the prefix indicating the next number is a room number
        (?:
            (?:(?i)rooms\ +)
            |
            (?P<staircase>[A-Z])
        )
        (?:(?i)
            # match a range of rooms, without all being listed
            (?P<range_start>\d+)
            \ *(?:-|to)\ *
            \g<staircase>?
            (?P<range_end>\d+)
            |
            # match a list of rooms, where only the first is prefixed
            (?:
                (?P<number>\d+)
                (?P<sep>,|\ |/|and|or|&)+
            )*
            (?P<number>\d++)
        )
        |
        # match a single room without a staircase. Intentionally special cased.
        # "gyp room N" is excluded with a lookbehind: a lookahead placed before
        # the literal "room" could never see "gyp" and so never rejected anything
        (?<!gyp\ )(?:(?i)room)\ +(?P<number>\d++)
    )\b
    ''',
    re.V1 | re.X  # V1 needed for scoped case insensitivity
)
# NOTE(review): `i` is not referenced by any code visible here; confirm nothing
# else relies on it before removing.
i = 0
|
||
def references_in_m(match):
    """Expand one ``room_range`` match into individual room references.

    ``match`` must offer the regex module's ``captures``/``spans`` methods in
    addition to ``group``/``span``.

    Yields ``(path, span)`` tuples. ``path`` is a list holding the staircase
    letter (if one was captured) followed by a room id string; ``span`` is a
    ``(start, end)`` pair of offsets into the searched string.

    * List form: one reference per number, each with that number's own span.
      The first span is widened by one character to take in the staircase
      letter. A single number uses the span of the whole match instead.
    * Exactly two numbers separated by ``/`` (e.g. ``N7/8``): additionally
      yields the combined id ``'7/8'`` first, spanning both numbers. The two
      individual rooms are still yielded afterwards.
    * Range form: one reference per number between the two bounds inclusive
      (given in either order), all sharing the span of the whole match.
    """
    staircase = match.captures('staircase')
    rooms = match.captures('number')

    if rooms:
        spans = match.spans('number')

        # first match should include staircase letter
        if staircase:
            first_start, first_end = spans[0]
            spans[0] = (first_start - 1, first_end)

        # special case rooms like N7/8
        if match.group('sep') == '/' and len(rooms) == 2:
            yield staircase + ['/'.join(rooms)], (spans[0][0], spans[1][1])

        if len(rooms) == 1:
            yield staircase + rooms, match.span()
        else:
            # builtin zip behaves the same on Python 2 and 3 for these
            # short lists
            for room_id, span in zip(rooms, spans):
                yield staircase + [room_id], span
    else:
        start = int(match.group('range_start'))
        end = int(match.group('range_end'))

        # accept ranges written backwards ("5-3") as well
        low, high = (start, end) if start < end else (end, start)

        span = match.span()

        for number in range(low, high + 1):
            yield staircase + [str(number)], span
|
||
def references_in(s):
    """Yield every ``(path, span)`` room reference found in the string *s*.

    Each non-overlapping ``room_range`` match in *s* is expanded by
    ``references_in_m``; the results are yielded in match order.
    """
    matches = room_range.finditer(s)
    for found in matches:
        for reference in references_in_m(found):
            yield reference