{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert PICA XML exports to Parquet\n",
    "\n",
    "Defines the XPath field mapping for PICA XML records and runs `librarian_robot.transform_formats.from_picaxml_with_multiple_entries_to_parquet` over the exported `.picaxml` files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import glob\n",
    "import os\n",
    "import sys\n",
    "import time\n",
    "\n",
    "# Third-party\n",
    "import pandas as pd\n",
    "import requests\n",
    "from lxml import etree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that contains the local `librarian_robot` package.\n",
    "# Set the LIBRARIAN_ROBOT_PARENT environment variable to point somewhere else;\n",
    "# the fallback is the path this notebook was originally written against.\n",
    "LIBRARIAN_ROBOT_PARENT = os.environ.get(\n",
    "    \"LIBRARIAN_ROBOT_PARENT\",\n",
    "    \"C:/Users/calvotello/Dropbox/MTB/Göttingen/research/\",\n",
    ")\n",
    "\n",
    "librarian_robot_parent = os.path.abspath(LIBRARIAN_ROBOT_PARENT)\n",
    "if librarian_robot_parent not in sys.path:  # re-running the cell must not add duplicates\n",
    "    sys.path.append(librarian_robot_parent)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Must run after the sys.path cell above, which makes the package importable.\n",
    "from librarian_robot import transform_formats\n",
    "\n",
    "# The defaults of from_picaxml_with_multiple_entries_to_parquet use `pica:`/`zs:`-prefixed\n",
    "# XPaths; this notebook passes its own xpaths_fields, xpath_entry and namespaces instead.\n",
    "# To see the full signature, run this in a scratch cell:\n",
    "#     transform_formats.from_picaxml_with_multiple_entries_to_parquet?"
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "xpaths_fields = {\n", " \n", " 'ppn' : './datafield[@tag=\"003@\"]/subfield[@code=\"0\"]/text()',\n", " 'medium' : './datafield[@tag=\"002@\"]/subfield[@code=\"0\"]/text()',\n", " 'title' : './datafield[@tag=\"021A\"]/subfield[@code=\"a\"]/text()',\n", " 'title_supplement' : './datafield[@tag=\"021A\"]/subfield[@code=\"d\"]/text()',\n", " 'year' : './datafield[@tag=\"011@\"]/subfield[@code=\"a\"]/text()',\n", " 'entry_first' : './datafield[@tag=\"001A\"]/subfield[@code=\"0\"]/text()',\n", " 'author_first_name' : './datafield[@tag=\"028A\"]/subfield[@code=\"d\" or @code=\"D\" ]/text()', \n", " 'author_last_name' : './datafield[@tag=\"028A\"]/subfield[@code=\"a\" or @code=\"A\" ]/text()',\n", " 'author_gnd_id' : './datafield[@tag=\"028A\"]/subfield[@code=\"9\"]/text()',\n", " 'editor_first_name' : './datafield[@tag=\"028C\"]/subfield[@code=\"d\" or @code=\"D\" ]/text()', \n", " 'editor_last_name' : './datafield[@tag=\"028C\"]/subfield[@code=\"a\" or @code=\"A\" ]/text()',\n", " 'editor_gnd_id' : './datafield[@tag=\"028C\"]/subfield[@code=\"9\"]/text()',\n", " #'author_gnd_notation' : './datafield[@tag=\"028A\"]/subfield[@code=\"8\"]/text()',\n", " 'isbn' : './datafield[@tag=\"004A\"]/subfield[@code=\"0\"]/text()',\n", " 'ILNs' : './datafield[@tag=\"001@\"]/subfield[@code=\"0\"]/text()',\n", " 'content_type_ppn' : './datafield[@tag=\"013D\"]/subfield[@code=\"9\"]/text()',\n", " 'content_type' : './datafield[@tag=\"013D\"]/subfield[@code=\"a\" or @code=\"8\"]/text()',\n", " \n", " \n", " 'publisher' : './datafield[@tag=\"033A\"]/subfield[@code=\"n\"]/text()',\n", " 'language_text' : './datafield[@tag=\"010@\"]/subfield[@code=\"a\"]/text()',\n", " #'language_original' : './datafield[@tag=\"010@\"]/subfield[@code=\"c\"]/text()',\n", " 'pages' : './datafield[@tag=\"034D\"]/subfield[@code=\"a\"]/text()',\n", " 'comment_isbn' : 
'./datafield[@tag=\"004A\"]/subfield[@code=\"f\"]/text()',\n", " #'issn' : './datafield[@tag=\"005A\"]/subfield[@code=\"0\"]/text()',\n", " #'comment_issn' : './datafield[@tag=\"005A\"]/subfield[@code=\"f\"]/text()',\n", " 'place_publication' : './datafield[@tag=\"033A\"]/subfield[@code=\"p\"]/text()',\n", " 'summary' : './datafield[@tag=\"047I\"]/subfield[@code=\"a\"]/text()',\n", " 'title_continuing_resource' : './datafield[@tag=\"036E\"]/subfield[@code=\"a\"]/text()',\n", "\n", "\n", " 'work_ppn' : './datafield[@tag=\"022A\"]/subfield[@code=\"9\"]/text()',\n", " 'work_info' : './datafield[@tag=\"022A\"]/subfield[@code=\"8\"]/text()',\n", " 'work_title' : './datafield[@tag=\"022A\"]/subfield[@code=\"a\"]/text()',\n", "\n", " #'expression_ppn' : './datafield[@tag=\"039M\"]/subfield[@code=\"9\"]/text()',\n", " #'expression_info' : './datafield[@tag=\"039M\"]/subfield[@code=\"8\"]/text()',\n", " #'expression_title' : './datafield[@tag=\"039M\"]/subfield[@code=\"a\"]/text()',\n", " \n", " 'DDC_notation' : './datafield[@tag=\"045F\"]/subfield[@code=\"a\"]/text()',\n", " 'DDC_sachgruppe_a' : './datafield[@tag=\"045E\"]/subfield[@code=\"a\"]/text()',\n", " 'DDC_sachgruppe_b' : './datafield[@tag=\"045E\"]/subfield[@code=\"b\"]/text()',\n", " 'DDC_sachgruppe_c' : './datafield[@tag=\"045E\"]/subfield[@code=\"c\"]/text()',\n", " #'DDC_grundnotation' : './datafield[@tag=\"045H\"]/subfield[@code=\"c\"]/text()',\n", "\n", " #'BK_ppn' : './datafield[@tag=\"045Q\"]/subfield[@code=\"9\"]/text()',\n", " #'BK_notation' : './datafield[@tag=\"045Q\"]/subfield[@code=\"a\"]/text()',\n", " #'BK_j' : './datafield[@tag=\"045Q\"]/subfield[@code=\"j\"]/text()',\n", "\n", " 'RVK_ppn' : './datafield[@tag=\"045Z\"]/subfield[@code=\"9\"]/text()',\n", " 'RVK_notation' : './datafield[@tag=\"045Z\"]/subfield[@code=\"8\"]/text()',\n", " #'RVK_j' : './datafield[@tag=\"045R\"]/subfield[@code=\"j\"]/text()',\n", " #'RVK_k' : './datafield[@tag=\"045R\"]/subfield[@code=\"k\"]/text()',\n", "\n", " 
'keyword_fremd_ppn' : './datafield[@tag=\"041A\"]/subfield[@code=\"9\"]/text()',\n", " 'keyword_fremd' : './datafield[@tag=\"041A\"]/subfield[@code=\"a\" or @code=\"8\"]/text()',\n", "\n", " 'keyword_einzel_ppn' : './datafield[@tag=\"044K\"]/subfield[@code=\"9\"]/text()',\n", " 'keyword_einzel' : './datafield[@tag=\"044K\"]/subfield[@code=\"a\" or @code=\"8\"]/text()',\n", "\n", " 'keyword_BDSL_a' : './datafield[@tag=\"044K\"]/subfield[@code=\"a\"]/text()',\n", " 'keyword_BDSL_p' : './datafield[@tag=\"044K\"]/subfield[@code=\"p\"]/text()',\n", " 'keyword_BDSL_t' : './datafield[@tag=\"044K\"]/subfield[@code=\"t\"]/text()',\n", " 'keyword_BDSL_s' : './datafield[@tag=\"044K\"]/subfield[@code=\"s\"]/text()',\n", "\n", " 'keyword_Fremddatenlieferanten_lieferanten' : './datafield[@tag=\"044K\"]/subfield[@code=\"2\"]/text()',\n", " 'keyword_Fremddatenlieferanten' : './datafield[@tag=\"044K\"]/subfield[@code=\"a\"]/text()',\n", "\n", " 'lcc_notation' : './datafield[@tag=\"045A\"]/subfield[@code=\"a\"]/text()',\n", " #'klassifikationssystem_hessisch_bib' : './datafield[@tag=\"045X\"]/subfield[@code=\"8\"]/text()',\n", " #'klassifikationssystem_hessisch_bib_id' : './datafield[@tag=\"045X\"]/subfield[@code=\"6\"]/text()', \n", "\n", " #'uri_description' : './datafield[@tag=\"209R\"]/subfield[@code=\"y\"]/text()',\n", " #'uri' : './datafield[@tag=\"209R\"]/subfield[@code=\"a\"]/text()',\n", "\n", " #'GOK_ppn' : './datafield[@tag=\"145Z\"][./subfield[@code=\"V\"]/text()=\"Tev\"]/subfield[@code=\"9\"]/text()',\n", " #'GOK_notation' : './datafield[@tag=\"145Z\"][./subfield[@code=\"V\"]/text()=\"Tev\"]/subfield[@code=\"a\"]/text()',\n", " #'GOK_j' : './datafield[@tag=\"145Z\"][./subfield[@code=\"V\"]/text()=\"Tev\"]/subfield[@code=\"j\"]/text()',\n", "\n", " 'signatur_place' : './datafield[@tag=\"209A\"]/subfield[@code=\"f\"]/text()',\n", " 'signatur' : './datafield[@tag=\"209A\"]/subfield[@code=\"a\"]/text()',\n", " 'signatur_date' : 
'./datafield[@tag=\"208@\"]/subfield[@code=\"a\"]/text()',\n", "\n", " #'Abrufzeichen' : './datafield[@tag=\"209O\"]/subfield[@code=\"a\"]/text()',\n", " }\n", "\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "root = etree.parse(\"P:/hebis/data/picaxml/rvk/pica_rvp_IA/1_501_pica_rvp_IA.picaxml\").getroot()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['10639570X',\n", " '106395513',\n", " '105490458',\n", " '105202207',\n", " '103555625',\n", " '103334742',\n", " '102761337',\n", " '102280541',\n", " '101887167',\n", " '101886861']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "root.xpath('/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/record/datafield[@tag=\"003@\"]/subfield[@code=\"0\"]/text()', namespaces = {'srw' : \"http://www.loc.gov/zing/srw/\"},)[0:10]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "./../data/picaxml/rvk\\pica_rvp_IA/\n", "/rvk\\pica_rvp_IA/\n", "3\n", "3\n", "['./../data/parquets//rvk/pica_rvp_IA/0_1_pica_rvp_IA.parquet', './../data/parquets//rvk/pica_rvp_IA/1_501_pica_rvp_IA.parquet', './../data/parquets//rvk/pica_rvp_IA/2_1001_pica_rvp_IA.parquet']\n", "path: ./../data/picaxml/rvk\\pica_rvp_IA/0_1_pica_rvp_IA.picaxml\n" ] } ], "source": [ "for wdir in glob.glob(\"./../data/picaxml/rvk/*\"):\n", " \n", " wdir = wdir + \"/\"\n", " print(wdir)\n", " start_time = time.time()\n", "\n", " transform_formats.from_picaxml_with_multiple_entries_to_parquet(\n", " wdir = wdir ,\n", " ending = \".picaxml\",\n", " xpaths_fields = xpaths_fields,\n", " xpath_entry = '/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/record',\n", " namespaces = {'srw' : \"http://www.loc.gov/zing/srw/\"},\n", " outdir = \"./../data/parquets/\",\n", " wait_if_existing = False,\n", " )" ] }, { 
"cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet(\"./../data/parquets/sachgruppen/pica_sgt_440/1_501_pica_sgt_440.parquet\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DDC_sachgruppe_a 500\n", "DDC_sachgruppe_b 500\n", "keyword_Fremddatenlieferanten_lieferanten 500\n", "keyword_BDSL_s 500\n", "keyword_BDSL_t 500\n", "keyword_BDSL_p 500\n", "work_ppn 496\n", "work_info 496\n", "lcc_notation 495\n", "keyword_BDSL_a 493\n", "keyword_Fremddatenlieferanten 493\n", "work_title 487\n", "keyword_fremd 436\n", "keyword_fremd_ppn 436\n", "summary 416\n", "editor_first_name 415\n", "editor_last_name 415\n", "DDC_notation 396\n", "author_last_name 360\n", "author_first_name 360\n", "editor_gnd_id 315\n", "author_gnd_id 306\n", "isbn 303\n", "comment_isbn 268\n", "title_supplement 238\n", "content_type 229\n", "content_type_ppn 229\n", "title_continuing_resource 229\n", "RVK_notation 223\n", "RVK_ppn 223\n", "signatur_place 197\n", "signatur 196\n", "keyword_einzel_ppn 159\n", "keyword_einzel 158\n", "pages 43\n", "title 10\n", "publisher 8\n", "place_publication 2\n", "medium 0\n", "year 0\n", "signatur_date 0\n", "entry_first 0\n", "ILNs 0\n", "language_text 0\n", "DDC_sachgruppe_c 0\n", "ppn 0\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isna().sum().sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "./../data/picaxml/rvk\\pica_rvp_IA/\n", "/rvk\\pica_rvp_IA/\n", "3\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_IA/0_1_pica_rvp_IA.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IA/1_501_pica_rvp_IA.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IA/2_1001_pica_rvp_IA.picaxml\n", "[]\n", "./../data/picaxml/rvk\\pica_rvp_IB/\n", "/rvk\\pica_rvp_IB/\n", 
"11\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/0_1_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/10_5001_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/1_501_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/2_1001_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/3_1501_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/4_2001_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/5_2501_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/6_3001_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/7_3501_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/8_4001_pica_rvp_IB.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IB/9_4501_pica_rvp_IB.picaxml\n", "[]\n", "./../data/picaxml/rvk\\pica_rvp_ID/\n", "/rvk\\pica_rvp_ID/\n", "38\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/0_1_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/10_5001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/11_5501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/12_6001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/13_6501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/14_7001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/15_7501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/16_8001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/17_8501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/18_9001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/19_9501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/1_501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/20_10001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/21_10501_pica_rvp_ID.picaxml\n", "path: 
./../data/picaxml/rvk\\pica_rvp_ID/22_11001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/23_11501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/24_12001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/25_12501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/26_13001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/27_13501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/28_14001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/29_14501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/2_1001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/30_15001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/31_15501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/32_16001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/33_16501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/34_17001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/35_17501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/36_18001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/37_18501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/3_1501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/4_2001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/5_2501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/6_3001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/7_3501_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/8_4001_pica_rvp_ID.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_ID/9_4501_pica_rvp_ID.picaxml\n", "[]\n", "./../data/picaxml/rvk\\pica_rvp_IE/\n", "/rvk\\pica_rvp_IE/\n", "16\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/0_1_pica_rvp_IE.picaxml\n", "path: 
./../data/picaxml/rvk\\pica_rvp_IE/10_5001_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/11_5501_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/12_6001_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/13_6501_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/14_7001_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/15_7501_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/1_501_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/2_1001_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/3_1501_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/4_2001_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/5_2501_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/6_3001_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/7_3501_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/8_4001_pica_rvp_IE.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IE/9_4501_pica_rvp_IE.picaxml\n", "[]\n", "./../data/picaxml/rvk\\pica_rvp_IF/\n", "/rvk\\pica_rvp_IF/\n", "15\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/0_1_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/10_5001_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/11_5501_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/12_6001_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/13_6501_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/14_7001_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/1_501_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/2_1001_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/3_1501_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/4_2001_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/5_2501_pica_rvp_IF.picaxml\n", "path: 
./../data/picaxml/rvk\\pica_rvp_IF/6_3001_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/7_3501_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/8_4001_pica_rvp_IF.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IF/9_4501_pica_rvp_IF.picaxml\n", "[]\n", "./../data/picaxml/rvk\\pica_rvp_IG/\n", "/rvk\\pica_rvp_IG/\n", "41\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/0_1_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/10_5001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/11_5501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/12_6001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/13_6501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/14_7001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/15_7501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/16_8001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/17_8501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/18_9001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/19_9501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/1_501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/20_10001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/21_10501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/22_11001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/23_11501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/24_12001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/25_12501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/26_13001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/27_13501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/28_14001_pica_rvp_IG.picaxml\n", "path: 
./../data/picaxml/rvk\\pica_rvp_IG/29_14501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/2_1001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/30_15001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/31_15501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/32_16001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/33_16501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/34_17001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/35_17501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/36_18001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/37_18501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/38_19001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/39_19501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/3_1501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/40_20001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/4_2001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/5_2501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/6_3001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/7_3501_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/8_4001_pica_rvp_IG.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IG/9_4501_pica_rvp_IG.picaxml\n", "[]\n", "./../data/picaxml/rvk\\pica_rvp_IH/\n", "/rvk\\pica_rvp_IH/\n", "75\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/0_1_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/10_5001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/11_5501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/12_6001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/13_6501_pica_rvp_IH.picaxml\n", "path: 
./../data/picaxml/rvk\\pica_rvp_IH/14_7001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/15_7501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/16_8001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/17_8501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/18_9001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/19_9501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/1_501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/20_10001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/21_10501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/22_11001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/23_11501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/24_12001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/25_12501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/26_13001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/27_13501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/28_14001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/29_14501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/2_1001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/30_15001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/31_15501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/32_16001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/33_16501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/34_17001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/35_17501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/36_18001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/37_18501_pica_rvp_IH.picaxml\n", "path: 
./../data/picaxml/rvk\\pica_rvp_IH/38_19001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/39_19501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/3_1501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/40_20001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/41_20501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/42_21001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/43_21501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/44_22001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/45_22501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/46_23001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/47_23501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/48_24001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/49_24501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/4_2001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/50_25001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/51_25501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/52_26001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/53_26501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/54_27001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/55_27501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/56_28001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/57_28501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/58_29001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/59_29501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/5_2501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/60_30001_pica_rvp_IH.picaxml\n", "path: 
./../data/picaxml/rvk\\pica_rvp_IH/61_30501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/62_31001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/63_31501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/64_32001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/65_32501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/66_33001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/67_33501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/68_34001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/69_34501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/6_3001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/70_35001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/71_35501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/72_36001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/73_36501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/74_37001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/7_3501_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/8_4001_pica_rvp_IH.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IH/9_4501_pica_rvp_IH.picaxml\n", "[]\n", "./../data/picaxml/rvk\\pica_rvp_IJ/\n", "/rvk\\pica_rvp_IJ/\n", "18\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/0_1_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/10_5001_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/11_5501_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/12_6001_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/13_6501_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/14_7001_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/15_7501_pica_rvp_IJ.picaxml\n", "path: 
./../data/picaxml/rvk\\pica_rvp_IJ/16_8001_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/17_8501_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/1_501_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/2_1001_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/3_1501_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/4_2001_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/5_2501_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/6_3001_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/7_3501_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/8_4001_pica_rvp_IJ.picaxml\n", "path: ./../data/picaxml/rvk\\pica_rvp_IJ/9_4501_pica_rvp_IJ.picaxml\n", "[]\n", "./../data/picaxml/rvk\\pica_rvp_IK/\n", "/rvk\\pica_rvp_IK/\n", "3\n", "0\n", "[]\n", "path: ./../data/picaxml/rvk\\pica_rvp_IK/0_1_pica_rvp_IK.picaxml\n" ] } ], "source": [ "# Convert every batch directory below data/picaxml/rvk/ to parquet.\n", "# Sorting keeps the processing order independent of the file system.\n", "for batch_dir in sorted(glob.glob(\"./../data/picaxml/rvk/*\")):\n", "    # The * pattern also matches plain files; only directories are batches.\n", "    if not os.path.isdir(batch_dir):\n", "        continue\n", "\n", "    # Trailing slash kept as in the original call; presumably the converter\n", "    # concatenates wdir with file names - TODO confirm in librarian_robot.\n", "    wdir = batch_dir + \"/\"\n", "    print(wdir)\n", "\n", "    transform_formats.from_picaxml_with_multiple_entries_to_parquet(\n", "        wdir=wdir,\n", "        ending=\".picaxml\",\n", "        xpaths_fields=xpaths_fields,\n", "        xpath_entry='/srw:searchRetrieveResponse/srw:records/srw:record/srw:recordData/record',\n", "        namespaces={'srw': \"http://www.loc.gov/zing/srw/\"},\n", "        outdir=\"./../data/parquets/\",\n", "        wait_if_existing=False,\n", "    )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "960f4d658457e0296618a96ff118bd2cbd6b6fed4373572a098ed4ec5a9be4c3" }, "kernelspec": { "display_name": "Python 3.7.6 64-bit ('base': conda)", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 }