Skip to content

Commit

Permalink
fix: changes due to metadata name changes
Browse files Browse the repository at this point in the history
  • Loading branch information
pierreaubert committed Nov 30, 2023
1 parent 077f3d5 commit 46d468e
Show file tree
Hide file tree
Showing 11 changed files with 191 additions and 128 deletions.
18 changes: 18 additions & 0 deletions generate_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import logging
import os
import pathlib
import re
import sys
import warnings

Expand All @@ -33,6 +34,7 @@
import datas.metadata as metadata

from spinorama import ray_setup_logger
import spinorama.constant_paths as cpaths

MINIRAY = None
try:
Expand Down Expand Up @@ -356,3 +358,19 @@ def sort_meta_score(s):
reverse=True,
)
return {k: meta[k] for k in keys_sorted_score}


def find_metadata_file():
    """Locate the hashed metadata JSON file on disk.

    The metadata generator writes "<base>-<hash>.json" where <hash> is a
    5-digit lowercase-hex digest.  Return the first matching file that still
    exists, or None when no such file can be found.
    """
    base = cpaths.CPATH_METADATA_JSON[:-5]  # strip the trailing ".json"
    hashed_pattern = re.compile(r".*[-][0-9a-f]{5}[.]json$")
    for candidate in glob("{}-[0-9a-f]*.json".format(base)):
        if hashed_pattern.match(candidate) is not None:
            # glob normally only yields existing paths; double-check anyway
            if os.path.exists(candidate):
                return candidate
            return None
    return None
8 changes: 4 additions & 4 deletions generate_eq_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from docopt import docopt
import numpy as np

from generate_common import get_custom_logger, args2level
from generate_common import get_custom_logger, args2level, find_metadata_file
from spinorama.constant_paths import CPATH_METADATA_JSON, CPATH_DOCS_SPEAKERS, CPATH_DATAS_EQ
from spinorama.need_update import need_update
from spinorama.pict import write_multiformat
Expand Down Expand Up @@ -71,9 +71,9 @@ def print_eq_compare(data, force):

def main(force):
# load all metadata from generated json file
json_filename = CPATH_METADATA_JSON
if not os.path.exists(json_filename):
logger.error("Cannot find %s", json_filename)
json_filename = find_metadata_file()
if json_filename is None:
logger.error("Cannot find metadata file!")
sys.exit(1)

jsmeta = None
Expand Down
152 changes: 91 additions & 61 deletions generate_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@

from datas.metadata import speakers_info as extradata
from generate_common import (
get_custom_logger,
args2level,
get_custom_logger,
find_metadata_file,
sort_metadata_per_score,
sort_metadata_per_date,
)
Expand Down Expand Up @@ -99,55 +100,93 @@ def write_if_different(new_content, filename: str):
]


def generate_measurement(
    dataframe,
    meta,
    site,
    use_search,
    speaker_name,
    origins,
    speaker_html,
    graph_html,
    origin,
    measurements,
    key,
    dfs,
):
    """Render the HTML pages for one measurement of one speaker.

    Writes index_<key>.html plus one small wrapper page per Vega graph,
    touching files only when their content actually changed.
    """
    logger.debug("generate %s %s %s", speaker_name, origin, key)

    def _select(wanted):
        # keep only the graphs that were actually computed for this measurement
        return {name: dfs[name] for name in wanted if name in dfs}

    freq = _select(FREQ_FILTER)
    contour = _select(CONTOUR_FILTER)
    radar = _select(RADAR_FILTER)
    # EQ comparison graphs make no sense for the "default_eq" variant
    eq = _select(["ref_vs_eq"]) if key != "default_eq" else None

    # output directory: per-origin for reviewers, per-brand otherwise
    dirname = "{}/{}/".format(cpaths.CPATH_DOCS_SPEAKERS, speaker_name)
    if origin in ("ASR", "Princeton", "ErinsAudioCorner", "Misc"):
        dirname += origin
    else:
        dirname += meta[speaker_name]["brand"]
    index_name = "{0}/index_{1}.html".format(dirname, key)

    # write index.html for this measurement
    logger.info("Writing %s for %s", index_name, speaker_name)
    write_if_different(
        speaker_html.render(
            speaker=speaker_name,
            g_freq=freq,
            g_contour=contour,
            g_radar=radar,
            g_key=key,
            g_eq=eq,
            meta=meta,
            origin=origin,
            site=site,
            use_search=use_search,
        ),
        index_name,
    )

    # one small HTML file per graph to host the Vega-generated JSON
    for graph_group in (freq, contour, radar):
        for graph_name in graph_group:
            graph_filename = "{0}/{1}/{2}.html".format(dirname, key, graph_name)
            logger.info("Writing %s/%s for %s", key, graph_filename, speaker_name)
            write_if_different(
                graph_html.render(
                    speaker=speaker_name, graph=graph_name, meta=meta, site=site
                ),
                graph_filename,
            )


def generate_speaker(
    dataframe, meta, site, use_search, speaker_name, origins, speaker_html, graph_html
):
    """Generate all HTML pages for one speaker across its origins/measurements.

    Delegates each (origin, key) pair to generate_measurement.  A KeyError in
    one measurement is reported and skipped so the remaining measurements of
    the speaker are still generated.
    """
    # NOTE(review): the scraped diff interleaved the removed inline body with
    # the added delegation code; this is the coherent post-commit version.
    for origin, measurements in origins.items():
        for key, dfs in measurements.items():
            try:
                # print('DEBUG: '+speaker_name+' origin='+origin+' version='+key)
                generate_measurement(
                    dataframe,
                    meta,
                    site,
                    use_search,
                    speaker_name,
                    origins,
                    speaker_html,
                    graph_html,
                    origin,
                    measurements,
                    key,
                    dfs,
                )
            except KeyError as key_error:
                print(
                    "generate_speaker: a file per speaker for {} failed with {}".format(
                        speaker_name, key_error
                    )
                )


def generate_speakers(mako, dataframe, meta, site, use_search):
Expand All @@ -156,29 +195,20 @@ def generate_speakers(mako, dataframe, meta, site, use_search):
graph_html = mako.get_template("graph.html")
for speaker_name, origins in dataframe.items():
logger.debug("html generation for speaker_name=" + speaker_name)
try:
if extradata[speaker_name].get("skip", False):
logger.debug("skipping %s", speaker_name)
continue
generate_speaker(
dataframe, meta, site, use_search, speaker_name, origins, speaker_html, graph_html
)
except KeyError as key_error:
print("Graph generation failed for {}".format(key_error))
if speaker_name in extradata and extradata[speaker_name].get("skip", False):
logger.debug("skipping %s", speaker_name)
continue
generate_speaker(
dataframe, meta, site, use_search, speaker_name, origins, speaker_html, graph_html
)

return 0


def main():
# load all metadata from generated json file
pattern = "{}-[0-9]*.json".format(cpaths.CPATH_METADATA_JSON[:-5])
json_filenames = glob(pattern)
json_filename = None
for json_maybe in json_filenames:
check = re.match(".*[-][0-9]{5}[.]json$", json_maybe)
if check is not None:
json_filename = json_maybe
if not os.path.exists(json_filename):
json_filename = find_metadata_file()
if json_filename is None:
logger.error("Cannot find %s", json_filename)
sys.exit(1)

Expand Down
89 changes: 53 additions & 36 deletions generate_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@
--dash-ip=<dash-ip> IP for the ray dashboard to track execution
--dash-port=<dash-port> Port for the ray dashboard
"""
import contextlib
from hashlib import md5
from itertools import groupby
import json
from glob import glob
import math
from pathlib import Path
import os
import sys
import time
Expand All @@ -70,6 +72,7 @@
cache_load,
custom_ray_init,
sort_metadata_per_date,
find_metadata_file,
)
import spinorama.constant_paths as cpaths
from spinorama.compute_estimates import estimates
Expand Down Expand Up @@ -769,51 +772,65 @@ def dict_to_json(filename, d):
return
# hash changed, remove old files
old_hash_pattern = "{}-*.json".format(filename[:-5])
for fileold in glob(old_hash_pattern):
logger.debug("remove old file %s", fileold)
os.remove(fileold)
for old_filename in glob(old_hash_pattern):
logger.debug("remove old file %s", old_filename)
os.remove(old_filename)
with open(hashed_filename, "w", encoding="utf-8") as f:
f.write(js)
f.close()

with zipfile.ZipFile(
hashed_filename + ".zip",
"w",
compression=zipfile.ZIP_DEFLATED,
allowZip64=True,
) as current_zip:
current_zip.writestr(hashed_filename, js)
# write the zip file
with zipfile.ZipFile(
hashed_filename + ".zip",
"w",
compression=zipfile.ZIP_DEFLATED,
allowZip64=True,
) as current_zip:
current_zip.writestr(hashed_filename, js)
logger.debug("generated %s and zip version", hashed_filename)

# add a link to make it easier for other scripts to find the metadata
with contextlib.suppress(OSError):
os.symlink(Path(hashed_filename).name, cpaths.CPATH_METADATA_JSON)

meta_full = {k: v for k, v in meta.items() if not v.get("skip", False)}
dict_to_json(metafile, meta_full)

# generate a short version for rapid home page charging
# TODO(pierre)
# let's check if it is faster to load slices than the full file
# partitionning is per year, each file is hashed and the hash
# is stored in the name.
# Warning: when reading the chunks you need to read them from recent to old and discard the keys you already have seen,
meta_sorted_date = list(sort_metadata_per_date(meta_full).items())
meta_sorted_date_head = dict(meta_sorted_date[0:10])
meta_sorted_date_tail = dict(meta_sorted_date[10:])

filename = metafile[:-5] + "-head.json"
dict_to_json(filename, meta_sorted_date_head)

def by_year(key):
m = meta_sorted_date_tail[key]
def_m = m["default_measurement"]
year = int(m["measurements"][def_m].get("review_published", "1970")[0:4])
# group together years without too many reviews
if year > 1970 and year < 2020:
return 2019
return year

grouped_by_year = groupby(meta_sorted_date_tail, by_year)
for year, group in grouped_by_year:
filename = "{}-{:4d}.json".format(metafile[:-5], year)
dict_to_json(filename, {k: meta_sorted_date_tail[k] for k in list(group)})

# debugjs = find_metadata_file()
# debugmeta = None
# with open(debugjs, "r") as f:
# debugmeta = json.load(f)
# print('DEBUG: size of full ==> {}'.format(len(meta.keys())))
# print('DEBUG: size of meta ==> {}'.format(len(meta_full.keys())))
# print('DEBUG: size of js ==> {}'.format(len(debugmeta.keys())))

# # generate a short version for rapid home page charging
# # TODO(pierre)
# # let's check if it is faster to load slices than the full file
# # partitionning is per year, each file is hashed and the hash
# # is stored in the name.
# # Warning: when reading the chunks you need to read them from recent to old and discard the keys you already have seen,
# meta_sorted_date = list(sort_metadata_per_date(meta_full).items())
# meta_sorted_date_head = dict(meta_sorted_date[0:10])
# meta_sorted_date_tail = dict(meta_sorted_date[10:])
#
# filename = metafile[:-5] + "-head.json"
# dict_to_json(filename, meta_sorted_date_head)
#
# def by_year(key):
# m = meta_sorted_date_tail[key]
# def_m = m["default_measurement"]
# year = int(m["measurements"][def_m].get("review_published", "1970")[0:4])
# # group together years without too many reviews
# if year > 1970 and year < 2020:
# return 2019
# return year
#
# grouped_by_year = groupby(meta_sorted_date_tail, by_year)
# for year, group in grouped_by_year:
# filename = "{}-{:4d}.json".format(metafile[:-5], year)
# dict_to_json(filename, {k: meta_sorted_date_tail[k] for k in list(group)})


def main():
Expand Down
2 changes: 1 addition & 1 deletion generate_peqs.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ def main():
if disable_ray:
df_all_speakers = cache_load_seq(filters=do_filters, smoke_test=smoke_test)
else:
df_all_speakers = cache_load(filters=do_filters, smoke_test=smoke_test, level)
df_all_speakers = cache_load(filters=do_filters, smoke_test=smoke_test, level=level)
except ValueError as v_e:
if speaker_name is not None:
print(
Expand Down
8 changes: 4 additions & 4 deletions generate_radar.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import plotly.graph_objects as go

from spinorama.constant_paths import CPATH_METADATA_JSON, CPATH_DOCS_SPEAKERS
from generate_common import get_custom_logger, args2level
from generate_common import get_custom_logger, args2level, find_metadata_file


VERSION = 0.1
Expand Down Expand Up @@ -158,9 +158,9 @@ def print_radar(data, scale):

def main():
# load all metadata from generated json file
json_filename = CPATH_METADATA_JSON
if not os.path.exists(json_filename):
logger.error("Cannot find %s", json_filename)
json_filename = find_metadata_file()
if json_filename is None:
logger.error("Cannot find metadata file, did you ran generate_meta.py ?")
sys.exit(1)

jsmeta = None
Expand Down
Loading

0 comments on commit 46d468e

Please sign in to comment.