Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Benchmark
Benchmark methodology
This was run 2 times on a AMD Ryzen 3700X with disabled boost, i.e. at a maximum and stable clock speed of 3.6GHz.
Imported was a GZIP-compressed copy of the All Cards Scryfall bulk data.
The file used for benchmarks was all-cards-20220708091601.json.gz
, so the card data was obtained on 2022-07-08.
Future benchmarks using newer data will not be comparable, as newer card data exports will continue to grow in size.
Python version used
CPython 3.10.4
Before
Total time: 170.557 s
File: mtg_proxy_printer/card_info_downloader.py
Function: _populate_database at line 280
Line # Hits Time Per Hit % Time Line Contents
==============================================================
280 def _populate_database(self, card_data: typing.Generator[JSONType, None, None]) -> int:
281 1 3.0 3.0 0.0 logger.info("About to populate the database with card data")
282 1 9.0 9.0 0.0 self.model.begin_transaction()
283 # Look up the printing filter ids only once per import
284 2 15.0 7.5 0.0 printing_filter_ids: typing.Dict[str, int] = {
285 filter_name: filter_id
286 for filter_name, filter_id
287 1 35.0 35.0 0.0 in self.model.db.execute("SELECT filter_name, filter_id FROM DisplayFilters").fetchall()}
288 1 1.0 1.0 0.0 set_wackiness_score_cache: typing.Dict[str, SetWackinessScore] = {}
289 1 1.0 1.0 0.0 skipped_cards = 0
290 1 0.0 0.0 0.0 index = 0
291 1 1.0 1.0 0.0 face_ids: IntTuples = []
292 1 1.0 1.0 0.0 db: sqlite3.Connection = self.model.db
293 374339 20906981.0 55.9 12.3 for index, card in enumerate(card_data, start=1):
294 374338 340099.0 0.9 0.2 if not self.should_run:
295 logger.info(f"Aborting card import after {index} cards due to user request.")
296 self.download_finished.emit()
297 return index
298 374338 342872.0 0.9 0.2 if card["object"] != "card":
299 logger.warning(f"Non-card found in card data during import: {card}")
300 continue
301 374338 445519.0 1.2 0.3 if _should_skip_card(card):
302 163 140.0 0.9 0.0 skipped_cards += 1
303 326 3512.0 10.8 0.0 db.execute(cached_dedent("""\
304 INSERT INTO RemovedPrintings (scryfall_id, language, oracle_id)
305 VALUES (?, ?, ?)
306 ON CONFLICT (scryfall_id) DO UPDATE
307 SET oracle_id = excluded.oracle_id,
308 language = excluded.language
309 WHERE oracle_id <> excluded.oracle_id
310 OR language <> excluded.language
311 163 223.0 1.4 0.0 ;"""), (card["id"], card["lang"], _get_oracle_id(card)))
312 163 142.0 0.9 0.0 continue
313 374175 255274.0 0.7 0.1 try:
314 374175 136619278.0 365.1 80.1 face_ids += self._parse_single_printing(card, printing_filter_ids, set_wackiness_score_cache)
315 except Exception as e:
316 logger.exception(f"Error while parsing card at position {index}. {card=}")
317 raise RuntimeError(f"Error while parsing card at position {index}: {e}")
318 374175 372985.0 1.0 0.2 if not index % 10000:
319 37 244.0 6.6 0.0 logger.debug(f"Imported {index} cards.")
320 1 578254.0 578254.0 0.3 _clean_unused_data(self.model.db, face_ids)
321 1 12.0 12.0 0.0 logger.info(f"Skipped {skipped_cards} cards during the import")
322 1 24.0 24.0 0.0 self.download_begins.emit(5, "Processing card filters")
323 2 3401603.0 1700801.5 2.0 self.model.store_current_printing_filters(
324 1 1.0 1.0 0.0 False, force_update_hidden_column=True, progress_signal=self.download_progress.emit)
325 # Store the timestamp of this import.
326 3 107.0 35.7 0.0 db.execute(cached_dedent(
327 1 0.0 0.0 0.0 """\
328 INSERT INTO LastDatabaseUpdate (reported_card_count)
329 VALUES (?)
330 """),
331 1 1.0 1.0 0.0 (index,)
332 )
333 # Populate the sqlite stat tables to give the query optimizer data to work with.
334 1 1729.0 1729.0 0.0 db.execute("ANALYZE\n")
335 1 7287808.0 7287808.0 4.3 db.commit()
336 1 2.0 2.0 0.0 return index
Total time: 134.212 s
File: mtg_proxy_printer/card_info_downloader.py
Function: _parse_single_printing at line 338
Line # Hits Time Per Hit % Time Line Contents
==============================================================
338 def _parse_single_printing(self, card: JSONType, printing_filter_ids, wackiness_score_cache):
339 374175 359325.0 1.0 0.3 language_id = _insert_language(self.model, card["lang"])
340 374175 372634.0 1.0 0.3 oracle_id = _get_oracle_id(card)
341 374175 1008829.0 2.7 0.8 card_id = _insert_card(self.model, oracle_id)
342 374175 4071888.0 10.9 3.0 set_id = _insert_set(self.model, card, wackiness_score_cache)
343 374175 21763233.0 58.2 16.2 printing_id = insert_printing(self.model, card, card_id, set_id)
344 374175 52561364.0 140.5 39.2 _insert_card_filters(self.model, printing_id, _get_card_filter_data(card), printing_filter_ids)
345 374175 53931455.0 144.1 40.2 new_face_ids = _insert_card_faces(self.model, card, language_id, printing_id)
346 374175 143480.0 0.4 0.1 return new_face_ids
Total time: 34.5982 s
File: mtg_proxy_printer/card_info_downloader.py
Function: _insert_card_filters at line 575
Line # Hits Time Per Hit % Time Line Contents
==============================================================
575 def _insert_card_filters(
576 model: CardDatabase, printing_id: int, filter_data: typing.Dict[str, bool],
577 printing_filter_ids: typing.Dict[str, int]):
578 748350 33692351.0 45.0 97.4 model.db.executemany(
579 374175 205228.0 0.5 0.6 cached_dedent("""\
580 INSERT OR REPLACE INTO PrintingDisplayFilter (printing_id, filter_id, filter_applies)
581 VALUES (?, ?, ?)
582 """),
583 748350 490876.0 0.7 1.4 ((printing_id, printing_filter_ids[filter_name], filter_applies)
584 374175 209724.0 0.6 0.6 for filter_name, filter_applies in filter_data.items())
585 )
After
Total time: 114.797 s
File: mtg_proxy_printer/card_info_downloader.py
Function: _populate_database at line 280
Line # Hits Time Per Hit % Time Line Contents
==============================================================
280 def _populate_database(self, card_data: typing.Generator[JSONType, None, None]) -> int:
281 1 4.0 4.0 0.0 logger.info("About to populate the database with card data")
282 1 14.0 14.0 0.0 self.model.begin_transaction()
283 # Look up the printing filter ids only once per import
284 2 23.0 11.5 0.0 printing_filter_ids: typing.Dict[str, int] = {
285 filter_name: filter_id
286 for filter_name, filter_id
287 1 58.0 58.0 0.0 in self.model.db.execute("SELECT filter_name, filter_id FROM DisplayFilters").fetchall()}
288 1 2.0 2.0 0.0 set_wackiness_score_cache: typing.Dict[str, SetWackinessScore] = {}
289 1 1.0 1.0 0.0 skipped_cards = 0
290 1 1.0 1.0 0.0 index = 0
291 1 1.0 1.0 0.0 face_ids: IntTuples = []
292 1 1.0 1.0 0.0 db: sqlite3.Connection = self.model.db
293 # Will be re-populated while iterating over the card data. Axing the previous data is far cheaper than trying
294 # to update it in-place by removing up to number-of-available-filters entries per each individual card,
295 # just to make sure that rare un-banned cards are updated properly.
296 1 28.0 28.0 0.0 db.execute("DELETE FROM PrintingDisplayFilter\n")
297 374339 20713590.0 55.3 18.0 for index, card in enumerate(card_data, start=1):
298 374338 336928.0 0.9 0.3 if not self.should_run:
299 logger.info(f"Aborting card import after {index} cards due to user request.")
300 self.download_finished.emit()
301 return index
302 374338 320212.0 0.9 0.3 if card["object"] != "card":
303 logger.warning(f"Non-card found in card data during import: {card}")
304 continue
305 374338 435528.0 1.2 0.4 if _should_skip_card(card):
306 163 115.0 0.7 0.0 skipped_cards += 1
307 326 3290.0 10.1 0.0 db.execute(cached_dedent("""\
308 INSERT INTO RemovedPrintings (scryfall_id, language, oracle_id)
309 VALUES (?, ?, ?)
310 ON CONFLICT (scryfall_id) DO UPDATE
311 SET oracle_id = excluded.oracle_id,
312 language = excluded.language
313 WHERE oracle_id <> excluded.oracle_id
314 OR language <> excluded.language
315 163 251.0 1.5 0.0 ;"""), (card["id"], card["lang"], _get_oracle_id(card)))
316 163 144.0 0.9 0.0 continue
317 374175 243929.0 0.7 0.2 try:
318 374175 85852112.0 229.4 74.8 face_ids += self._parse_single_printing(card, printing_filter_ids, set_wackiness_score_cache)
319 except Exception as e:
320 logger.exception(f"Error while parsing card at position {index}. {card=}")
321 raise RuntimeError(f"Error while parsing card at position {index}: {e}")
322 374175 357011.0 1.0 0.3 if not index % 10000:
323 37 241.0 6.5 0.0 logger.debug(f"Imported {index} cards.")
324 1 579530.0 579530.0 0.5 _clean_unused_data(self.model.db, face_ids)
325 1 12.0 12.0 0.0 logger.info(f"Skipped {skipped_cards} cards during the import")
326 1 24.0 24.0 0.0 self.download_begins.emit(5, "Processing card filters")
327 2 2407962.0 1203981.0 2.1 self.model.store_current_printing_filters(
328 1 2.0 2.0 0.0 False, force_update_hidden_column=True, progress_signal=self.download_progress.emit)
329 # Store the timestamp of this import.
330 3 105.0 35.0 0.0 db.execute(cached_dedent(
331 1 1.0 1.0 0.0 """\
332 INSERT INTO LastDatabaseUpdate (reported_card_count)
333 VALUES (?)
334 """),
335 1 1.0 1.0 0.0 (index,)
336 )
337 # Populate the sqlite stat tables to give the query optimizer data to work with.
338 1 1775.0 1775.0 0.0 db.execute("ANALYZE\n")
339 1 3543949.0 3543949.0 3.1 db.commit()
340 1 3.0 3.0 0.0 return index
Total time: 83.4674 s
File: mtg_proxy_printer/card_info_downloader.py
Function: _parse_single_printing at line 342
Line # Hits Time Per Hit % Time Line Contents
==============================================================
342 def _parse_single_printing(self, card: JSONType, printing_filter_ids, wackiness_score_cache):
343 374175 347807.0 0.9 0.4 language_id = _insert_language(self.model, card["lang"])
344 374175 362442.0 1.0 0.4 oracle_id = _get_oracle_id(card)
345 374175 793475.0 2.1 1.0 card_id = _insert_card(self.model, oracle_id)
346 374175 3979204.0 10.6 4.8 set_id = _insert_set(self.model, card, wackiness_score_cache)
347 374175 18003759.0 48.1 21.6 printing_id = insert_printing(self.model, card, card_id, set_id)
348 374175 22788682.0 60.9 27.3 _insert_card_filters(self.model, printing_id, _get_card_filter_data(card), printing_filter_ids)
349 374175 37048531.0 99.0 44.4 new_face_ids = _insert_card_faces(self.model, card, language_id, printing_id)
350 374175 143520.0 0.4 0.2 return new_face_ids
Total time: 4.48453 s
File: mtg_proxy_printer/card_info_downloader.py
Function: _insert_card_filters at line 579
Line # Hits Time Per Hit % Time Line Contents
==============================================================
579 def _insert_card_filters(
580 model: CardDatabase, printing_id: int, filter_data: typing.Dict[str, bool],
581 printing_filter_ids: typing.Dict[str, int]):
582 748350 3598343.0 4.8 80.2 model.db.executemany(
583 374175 161529.0 0.4 3.6 "INSERT OR IGNORE INTO PrintingDisplayFilter (printing_id, filter_id) VALUES (?, ?)\n",
584 748350 512545.0 0.7 11.4 ((printing_id, printing_filter_ids[filter_name])
585 374175 212110.0 0.6 4.7 for filter_name, filter_applies in filter_data.items() if filter_applies)
586 )
6 check-ins related to "carddb_optimization"
2022-07-08
| ||
17:46 | Optimized the card database size by only storing positive printing filters. Completes [4d9d5ad8f05730a5]. See [/wiki?name=branch/carddb_optimization&p] for benchmark details. check-in: c54adb5109 user: thomas tags: trunk | |
17:42 | Added changelog entry Closed-Leaf check-in: 98551c9879 user: thomas tags: carddb_optimization | |
17:30 | Further optimization during card data import: Instead of deleting excess entries in PrintingDisplayFilter individually, simply clear the whole table and re-populate it. check-in: 1dc70b4ec9 user: thomas tags: carddb_optimization | |
16:56 | Database schema: Removed duplicate release_date column in the AllPrintings view. check-in: 04f30c24ae user: thomas tags: carddb_optimization | |
16:52 | Optimize the handling of printing filters. The database no longer stores the full cross product in the PrintingDisplayFilter table. Instead, only positive values (a filter applies to a printing) is stored. The negative case is now implied. This reduces the total table size from ~6.6 million to ~126 thousand entries. This reduces the database file size from ~230 MiB to 160 MiB. check-in: 0b50f651ab user: thomas tags: carddb_optimization | |
13:12 | Document.save_to_disk(): Added three logging lines. check-in: 4efb66cf41 user: thomas tags: trunk | |