diff options
author | Anton Luka Šijanec <anton@sijanec.eu> | 2022-11-07 12:43:33 +0100 |
---|---|---|
committer | Anton Luka Šijanec <anton@sijanec.eu> | 2022-11-07 12:43:33 +0100 |
commit | d87288573e19a7aca802d172e80bbafbf692dc71 (patch) | |
tree | 0041df2568e0bc63012fb742f0291a2afe6f1f19 | |
parent | increased how many failed acsms not 200 in a row to stop to 100 (diff) | |
download | biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.gz biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.bz2 biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.lz biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.xz biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.zst biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.zip |
-rwxr-xr-x | gather.py | 20 |
1 files changed, 14 insertions, 6 deletions
```diff
@@ -10,7 +10,7 @@ try:
 except ModuleNotFoundError:
     raise ModuleNotFoundError("emerge dev-python/sqlalchemy or pip install SQLAlchemy")
 try:
-    from bs4 import BeautifulSoup
+    from bs4 import BeautifulSoup, FeatureNotFound
 except ModuleNotFoundError:
     raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4")
 
@@ -29,7 +29,7 @@ class Book(Base):
     creator = Column(String, nullable=True, doc="author of the book, dc:creator in acsm")
     publisher = Column(String, nullable=True, doc="publisher of the book, dc:publisher in acsm")
     identifier = Column(String, nullable=True, doc="if dc:identifier can't be derived from isbn, it's stored here. if dc:identifier element is missing, a literal string noidentifier is stored.")
-    thumbnail_extension = Column(String, nullable=True, doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png")
+    thumbnail_extension = Column(String, nullable=True, doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png, may be None if there's no thumbnailURL element")
     format = Column(String, nullable=True, doc="format of the file. I've seen application/pdf and application/epub+zip")
     language = Column(String, nullable=True, doc="language of the book. I've seen sl.")
     borrows = relationship("Borrow", back_populates="book");
@@ -106,7 +106,10 @@ try:
         force_acsm_id = acsm_id+1
         failed_acsms += 1
     else:
-        acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8")
+        try:
+            acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8")
+        except FeatureNotFound:
+            raise FeatureNotFound("pip3 install lxml")
         ft = acsm.fulfillmentToken
         expected = f"ACS-BIBL-L-{acsm_id}"
         if ft.transaction.string != expected:
@@ -127,9 +130,14 @@ try:
             raise ValueError(f"expected {expected} in ft.resourceItemInfo.licenseToken.resource.string but instead received {ft.resourceItemInfo.licenseToken.resource.string} in acsm {acsm_id}")
         uuid = expected.split(":").pop()
         expected = f"https://cs.alliance.inkbook.eu/books/{uuid}."
-        if ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected) != True:
-            raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}")
-        thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop()
+        try:
+            if ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected) != True:
+                raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}")
+            thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop()
+        except AttributeError:
+            thumbnail_extension = None
+            if ft.resourceItemInfo.metadata.thumbnailURL != None:
+                raise ValueError(f"thumbnailURL actually exists, but it failed to be parsed in acsm {acsm_id}")
         duration = int(ft.resourceItemInfo.licenseToken.permissions.display.duration.string)
         if duration != int(ft.resourceItemInfo.licenseToken.permissions.play.duration.string):
             raise ValueError(f"expected {duration} in fr.int(resourceItemInfo.licenseToken.permissions.play.duration.string) but instead received {int(resourceItemInfo.licenseToken.permissions.play.duration.string)} in acsm {acsm_id}")
```