{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from lxml import etree\n", "import requests\n", "import os\n", "import glob\n", "import re\n", "import numpy as np\n", "import sys" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Directory that contains the local librarian_robot package.\n", "# Set the LIBRARIAN_ROBOT_DIR environment variable to run the notebook on another machine;\n", "# the path used so far is kept as the default.\n", "librarian_robot_dir = os.environ.get(\n", "    \"LIBRARIAN_ROBOT_DIR\",\n", "    \"C:/Users/calvotello/Dropbox/MTB/Göttingen/research/\",\n", ")\n", "sys.path.append(os.path.abspath(librarian_robot_dir))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from librarian_robot import transform_formats, basic_functions\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "concat_df = pd.read_parquet(\"./../data/parquets/pica_rvp_IA-IZ_pica_sgt_440_860.parquet\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexppnmediumtitletitle_supplementyearentry_firstauthor_first_nameauthor_last_nameauthor_gnd_id...keyword_BDSL_pkeyword_BDSL_tkeyword_BDSL_skeyword_Fremddatenlieferanten_lieferantenkeyword_Fremddatenlieferantenlcc_notationsignatur_placesignatursignatur_dateapi_query
00484825402OaxLes @Arts et les Lettres en Provence au temps ...None20206055:27-08-21JeanArrouyeNone...NaNNaNNaNNaNf Kongress|g Aix-en-Provence <2009>NoneNoneNone27-08-21|27-08-21|27-08-21|27-08-21|27-08-21|2...pica_rvp_IA
11469727594AauMédiations et construction de l'Antiquité dans...None20200026:22-09-20NoneNoneNone...NaNNaNNaNNaNNoneNone000|11291.220.68|Za 195 (101)30-11-20|22-09-20pica_rvp_IA
22466221029AbvcLire magazine littéraireNone20206050:02-07-20NoneNoneNone...NaNNaNNaNNaNNoneNone330|022|019|07401/IA 6450|28 Rom Z 15707|Z 603|Z 87209-08-01|11-08-20|05-08-20|16-12-20pica_rvp_IA
33450873676AbvcFrancophonies du mondeNone20196050:18-07-19NoneNoneNone...NaNNaNNaNNaNNoneNone330|02216/IA 5021a|28 Frz Z 1775[Suppl.14-06-07|24-07-19pica_rvp_IA
44442132174AauLe @\"théâtre provincial\" en France(XVIe-XVIIIe siècle)20180026:31-01-19NoneNoneNone...NaNNaNNaNNaNNoneNone000|112291.535|Za 195 (97)19-09-19|31-01-19pica_rvp_IA
..................................................................
1897587448323834OaxAvantgarde und RevolutionMexikanische Lyrik von López Velarde bis Octav...19876055:07-05-19NoneNoneNone...NaNNaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_860
18975980363749462AauIn einer fremden StadtGedichte und Fragmente19836000:03-09-15NoneNone363316256...NaNNaNNaNNaNNoneNone112P 2 19 MENE a 1983/126-05-21pica_sgt_860
18976010344832184XOaxDie @Frau im spanischen Roman nach dem Bürgerk...Camilo José Cela, Carmen Laforet, Ana María Ma...19826055:07-05-19SylviaTruxaNone...NaNNaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_860
189761244047829680AbvcLateinamerika-StudienNone19766050:20-06-96NoneNoneNone...NaNNaNNaNNaNNoneNone000|000|000|002|000|000Zs 14176|83.144.48|83.896.55|AZ 161|SS 78/43|0...06-10-98|06-10-98|16-11-06|26-04-06|07-10-98|0...pica_sgt_860
189762473386319588OauDer @Troubadour der spanischen FalangeNone19656000:27-08-16NoneNone140900845...NaNNaNNaNNaNNoneNone900/24-09-19|21-11-16|24-09-19|24-09-19|24-09-19|2...pica_sgt_860
\n", "

189763 rows × 48 columns

\n", "
" ], "text/plain": [ " index ppn medium \\\n", "0 0 484825402 Oax \n", "1 1 469727594 Aau \n", "2 2 466221029 Abvc \n", "3 3 450873676 Abvc \n", "4 4 442132174 Aau \n", "... ... ... ... \n", "189758 7 448323834 Oax \n", "189759 80 363749462 Aau \n", "189760 103 44832184X Oax \n", "189761 244 047829680 Abvc \n", "189762 473 386319588 Oau \n", "\n", " title \\\n", "0 Les @Arts et les Lettres en Provence au temps ... \n", "1 Médiations et construction de l'Antiquité dans... \n", "2 Lire magazine littéraire \n", "3 Francophonies du monde \n", "4 Le @\"théâtre provincial\" en France \n", "... ... \n", "189758 Avantgarde und Revolution \n", "189759 In einer fremden Stadt \n", "189760 Die @Frau im spanischen Roman nach dem Bürgerk... \n", "189761 Lateinamerika-Studien \n", "189762 Der @Troubadour der spanischen Falange \n", "\n", " title_supplement year \\\n", "0 None 2020 \n", "1 None 2020 \n", "2 None 2020 \n", "3 None 2019 \n", "4 (XVIe-XVIIIe siècle) 2018 \n", "... ... ... \n", "189758 Mexikanische Lyrik von López Velarde bis Octav... 1987 \n", "189759 Gedichte und Fragmente 1983 \n", "189760 Camilo José Cela, Carmen Laforet, Ana María Ma... 1982 \n", "189761 None 1976 \n", "189762 None 1965 \n", "\n", " entry_first author_first_name author_last_name author_gnd_id ... \\\n", "0 6055:27-08-21 Jean Arrouye None ... \n", "1 0026:22-09-20 None None None ... \n", "2 6050:02-07-20 None None None ... \n", "3 6050:18-07-19 None None None ... \n", "4 0026:31-01-19 None None None ... \n", "... ... ... ... ... ... \n", "189758 6055:07-05-19 None None None ... \n", "189759 6000:03-09-15 None None 363316256 ... \n", "189760 6055:07-05-19 Sylvia Truxa None ... \n", "189761 6050:20-06-96 None None None ... \n", "189762 6000:27-08-16 None None 140900845 ... \n", "\n", " keyword_BDSL_p keyword_BDSL_t keyword_BDSL_s \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "... ... ... ... 
\n", "189758 NaN NaN NaN \n", "189759 NaN NaN NaN \n", "189760 NaN NaN NaN \n", "189761 NaN NaN NaN \n", "189762 NaN NaN NaN \n", "\n", " keyword_Fremddatenlieferanten_lieferanten \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "... ... \n", "189758 NaN \n", "189759 NaN \n", "189760 NaN \n", "189761 NaN \n", "189762 NaN \n", "\n", " keyword_Fremddatenlieferanten lcc_notation \\\n", "0 f Kongress|g Aix-en-Provence <2009> None \n", "1 None None \n", "2 None None \n", "3 None None \n", "4 None None \n", "... ... ... \n", "189758 None None \n", "189759 None None \n", "189760 None None \n", "189761 None None \n", "189762 None None \n", "\n", " signatur_place \\\n", "0 None \n", "1 000|112 \n", "2 330|022|019|074 \n", "3 330|022 \n", "4 000|112 \n", "... ... \n", "189758 None \n", "189759 112 \n", "189760 None \n", "189761 000|000|000|002|000|000 \n", "189762 900 \n", "\n", " signatur \\\n", "0 None \n", "1 91.220.68|Za 195 (101) \n", "2 01/IA 6450|28 Rom Z 15707|Z 603|Z 872 \n", "3 16/IA 5021a|28 Frz Z 1775[Suppl. \n", "4 291.535|Za 195 (97) \n", "... ... \n", "189758 None \n", "189759 P 2 19 MENE a 1983/1 \n", "189760 None \n", "189761 Zs 14176|83.144.48|83.896.55|AZ 161|SS 78/43|0... \n", "189762 / \n", "\n", " signatur_date api_query \n", "0 27-08-21|27-08-21|27-08-21|27-08-21|27-08-21|2... pica_rvp_IA \n", "1 30-11-20|22-09-20 pica_rvp_IA \n", "2 09-08-01|11-08-20|05-08-20|16-12-20 pica_rvp_IA \n", "3 14-06-07|24-07-19 pica_rvp_IA \n", "4 19-09-19|31-01-19 pica_rvp_IA \n", "... ... ... \n", "189758 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189759 26-05-21 pica_sgt_860 \n", "189760 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189761 06-10-98|06-10-98|16-11-06|26-04-06|07-10-98|0... pica_sgt_860 \n", "189762 24-09-19|21-11-16|24-09-19|24-09-19|24-09-19|2... 
pica_sgt_860 \n", "\n", "[189763 rows x 48 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Year" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Take the first run of four digits in the year column (NaN when there is none).\n", "# The pattern is a raw string so the regex backslashes are not read as string escapes.\n", "concat_df[\"year_publication\"] = concat_df[\"year\"].str.findall(r\"\\d\\d\\d\\d\").str.get(0)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Drop rows without a four-digit year. The explicit copy makes the result an independent\n", "# frame, so the column assignment in the next cell no longer raises SettingWithCopyWarning.\n", "concat_df = concat_df.loc[concat_df[\"year_publication\"].notna()].copy()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "concat_df[\"year_publication\"] = concat_df[\"year_publication\"].astype(int)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2020\n", "1 2020\n", "2 2020\n", "3 2019\n", "4 2018\n", " ...
\n", "189758 1987\n", "189759 1983\n", "189760 1982\n", "189761 1976\n", "189762 1965\n", "Name: year_publication, Length: 186879, dtype: int32" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df[\"year_publication\"]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Keep publication years from 1980 up to, but not including, 2020.\n", "concat_df = concat_df.query(\"year_publication >= 1980 and year_publication < 2020\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexppnmediumtitletitle_supplementyearentry_firstauthor_first_nameauthor_last_nameauthor_gnd_id...keyword_BDSL_tkeyword_BDSL_skeyword_Fremddatenlieferanten_lieferantenkeyword_Fremddatenlieferantenlcc_notationsignatur_placesignatursignatur_dateapi_queryyear_publication
33450873676AbvcFrancophonies du mondeNone20196050:18-07-19NoneNoneNone...NaNNaNNaNNoneNone330|02216/IA 5021a|28 Frz Z 1775[Suppl.14-06-07|24-07-19pica_rvp_IA2019
44442132174AauLe @\"théâtre provincial\" en France(XVIe-XVIIIe siècle)20180026:31-01-19NoneNoneNone...NaNNaNNaNNoneNone000|112291.535|Za 195 (97)19-09-19|31-01-19pica_rvp_IA2018
55438697448AauModernités des troubadoursNone20180077:13-11-18NoneNoneNone...NaNNaNNaNNonePN56.T768000|00091.282.92|288.69112-10-21|11-12-18pica_rvp_IA2018
66424315815AbvcLe @nouveau magazine littéraireNone20186050:29-12-17NoneNoneNone...NaNNaNNaNNoneNone330|022|07401/IA 6450|28 Rom Z 15592|Z 87124-07-20|10-01-18|08-01-18pica_rvp_IA2018
77481184937OauCourage de la vérité et écritures de l'histoire(XVIe-XVIIIe siècle)20170026:05-07-21NoneNoneNone...NaNNaNNaNNoneNoneNoneNone05-07-21|05-07-21|05-07-21|05-07-21|05-07-21|0...pica_rvp_IA2017
..................................................................
1897560448323850OaxCalderónFremdheit und Nähe eines spanischen Barockdram...19886055:07-05-19NoneNoneNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601988
1897576448323842OaxDein Körper neben mirGedichte Zweisprachige Ausgabe19876055:07-05-19JaimeSabinesNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601987
1897587448323834OaxAvantgarde und RevolutionMexikanische Lyrik von López Velarde bis Octav...19876055:07-05-19NoneNoneNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601987
18975980363749462AauIn einer fremden StadtGedichte und Fragmente19836000:03-09-15NoneNone363316256...NaNNaNNaNNoneNone112P 2 19 MENE a 1983/126-05-21pica_sgt_8601983
18976010344832184XOaxDie @Frau im spanischen Roman nach dem Bürgerk...Camilo José Cela, Carmen Laforet, Ana María Ma...19826055:07-05-19SylviaTruxaNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601982
\n", "

122444 rows × 49 columns

\n", "
" ], "text/plain": [ " index ppn medium \\\n", "3 3 450873676 Abvc \n", "4 4 442132174 Aau \n", "5 5 438697448 Aau \n", "6 6 424315815 Abvc \n", "7 7 481184937 Oau \n", "... ... ... ... \n", "189756 0 448323850 Oax \n", "189757 6 448323842 Oax \n", "189758 7 448323834 Oax \n", "189759 80 363749462 Aau \n", "189760 103 44832184X Oax \n", "\n", " title \\\n", "3 Francophonies du monde \n", "4 Le @\"théâtre provincial\" en France \n", "5 Modernités des troubadours \n", "6 Le @nouveau magazine littéraire \n", "7 Courage de la vérité et écritures de l'histoire \n", "... ... \n", "189756 Calderón \n", "189757 Dein Körper neben mir \n", "189758 Avantgarde und Revolution \n", "189759 In einer fremden Stadt \n", "189760 Die @Frau im spanischen Roman nach dem Bürgerk... \n", "\n", " title_supplement year \\\n", "3 None 2019 \n", "4 (XVIe-XVIIIe siècle) 2018 \n", "5 None 2018 \n", "6 None 2018 \n", "7 (XVIe-XVIIIe siècle) 2017 \n", "... ... ... \n", "189756 Fremdheit und Nähe eines spanischen Barockdram... 1988 \n", "189757 Gedichte Zweisprachige Ausgabe 1987 \n", "189758 Mexikanische Lyrik von López Velarde bis Octav... 1987 \n", "189759 Gedichte und Fragmente 1983 \n", "189760 Camilo José Cela, Carmen Laforet, Ana María Ma... 1982 \n", "\n", " entry_first author_first_name author_last_name author_gnd_id ... \\\n", "3 6050:18-07-19 None None None ... \n", "4 0026:31-01-19 None None None ... \n", "5 0077:13-11-18 None None None ... \n", "6 6050:29-12-17 None None None ... \n", "7 0026:05-07-21 None None None ... \n", "... ... ... ... ... ... \n", "189756 6055:07-05-19 None None None ... \n", "189757 6055:07-05-19 Jaime Sabines None ... \n", "189758 6055:07-05-19 None None None ... \n", "189759 6000:03-09-15 None None 363316256 ... \n", "189760 6055:07-05-19 Sylvia Truxa None ... \n", "\n", " keyword_BDSL_t keyword_BDSL_s \\\n", "3 NaN NaN \n", "4 NaN NaN \n", "5 NaN NaN \n", "6 NaN NaN \n", "7 NaN NaN \n", "... ... ... 
\n", "189756 NaN NaN \n", "189757 NaN NaN \n", "189758 NaN NaN \n", "189759 NaN NaN \n", "189760 NaN NaN \n", "\n", " keyword_Fremddatenlieferanten_lieferanten \\\n", "3 NaN \n", "4 NaN \n", "5 NaN \n", "6 NaN \n", "7 NaN \n", "... ... \n", "189756 NaN \n", "189757 NaN \n", "189758 NaN \n", "189759 NaN \n", "189760 NaN \n", "\n", " keyword_Fremddatenlieferanten lcc_notation signatur_place \\\n", "3 None None 330|022 \n", "4 None None 000|112 \n", "5 None PN56.T768 000|000 \n", "6 None None 330|022|074 \n", "7 None None None \n", "... ... ... ... \n", "189756 None None None \n", "189757 None None None \n", "189758 None None None \n", "189759 None None 112 \n", "189760 None None None \n", "\n", " signatur \\\n", "3 16/IA 5021a|28 Frz Z 1775[Suppl. \n", "4 291.535|Za 195 (97) \n", "5 91.282.92|288.691 \n", "6 01/IA 6450|28 Rom Z 15592|Z 871 \n", "7 None \n", "... ... \n", "189756 None \n", "189757 None \n", "189758 None \n", "189759 P 2 19 MENE a 1983/1 \n", "189760 None \n", "\n", " signatur_date api_query \\\n", "3 14-06-07|24-07-19 pica_rvp_IA \n", "4 19-09-19|31-01-19 pica_rvp_IA \n", "5 12-10-21|11-12-18 pica_rvp_IA \n", "6 24-07-20|10-01-18|08-01-18 pica_rvp_IA \n", "7 05-07-21|05-07-21|05-07-21|05-07-21|05-07-21|0... pica_rvp_IA \n", "... ... ... \n", "189756 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189757 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189758 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189759 26-05-21 pica_sgt_860 \n", "189760 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "\n", " year_publication \n", "3 2019 \n", "4 2018 \n", "5 2018 \n", "6 2018 \n", "7 2017 \n", "... ... 
\n", "189756 1988 \n", "189757 1987 \n", "189758 1987 \n", "189759 1983 \n", "189760 1982 \n", "\n", "[122444 rows x 49 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Primary Literature" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sachgruppe" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['index',\n", " 'ppn',\n", " 'medium',\n", " 'title',\n", " 'title_supplement',\n", " 'year',\n", " 'entry_first',\n", " 'author_first_name',\n", " 'author_last_name',\n", " 'author_gnd_id',\n", " 'editor_first_name',\n", " 'editor_last_name',\n", " 'editor_gnd_id',\n", " 'isbn',\n", " 'ILNs',\n", " 'content_type_ppn',\n", " 'content_type',\n", " 'publisher',\n", " 'language_text',\n", " 'pages',\n", " 'comment_isbn',\n", " 'place_publication',\n", " 'summary',\n", " 'title_continuing_resource',\n", " 'work_ppn',\n", " 'work_info',\n", " 'work_title',\n", " 'DDC_notation',\n", " 'DDC_sachgruppe_a',\n", " 'DDC_sachgruppe_b',\n", " 'DDC_sachgruppe_c',\n", " 'RVK_ppn',\n", " 'RVK_notation',\n", " 'keyword_fremd_ppn',\n", " 'keyword_fremd',\n", " 'keyword_einzel_ppn',\n", " 'keyword_einzel',\n", " 'keyword_BDSL_a',\n", " 'keyword_BDSL_p',\n", " 'keyword_BDSL_t',\n", " 'keyword_BDSL_s',\n", " 'keyword_Fremddatenlieferanten_lieferanten',\n", " 'keyword_Fremddatenlieferanten',\n", " 'lcc_notation',\n", " 'signatur_place',\n", " 'signatur',\n", " 'signatur_date',\n", " 'api_query',\n", " 'year_publication']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df.columns.tolist()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "concat_df = concat_df.loc[~concat_df[\"DDC_sachgruppe_c\"].fillna(\"\").str.contains(\"B\")]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, 
"outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DDC_sachgruppe_c
2206B|840
2388B|840
2600860|B
17057840|B
21550840|B
......
189714860|B
189716860|B
189721860|B
189722860|B
189759860|B
\n", "

5426 rows × 1 columns

\n", "
" ], "text/plain": [ " DDC_sachgruppe_c\n", "2206 B|840\n", "2388 B|840\n", "2600 860|B\n", "17057 840|B\n", "21550 840|B\n", "... ...\n", "189714 860|B\n", "189716 860|B\n", "189721 860|B\n", "189722 860|B\n", "189759 860|B\n", "\n", "[5426 rows x 1 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df.loc[concat_df[\"DDC_sachgruppe_c\"].fillna(\"\").str.contains(\"B\")][[\"DDC_sachgruppe_c\"]]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "117018" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df.shape[0]- 5426" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexppnmediumtitletitle_supplementyearentry_firstauthor_first_nameauthor_last_nameauthor_gnd_id...keyword_BDSL_tkeyword_BDSL_skeyword_Fremddatenlieferanten_lieferantenkeyword_Fremddatenlieferantenlcc_notationsignatur_placesignatursignatur_dateapi_queryyear_publication
33450873676AbvcFrancophonies du mondeNone20196050:18-07-19NoneNoneNone...NaNNaNNaNNoneNone330|02216/IA 5021a|28 Frz Z 1775[Suppl.14-06-07|24-07-19pica_rvp_IA2019
44442132174AauLe @\"théâtre provincial\" en France(XVIe-XVIIIe siècle)20180026:31-01-19NoneNoneNone...NaNNaNNaNNoneNone000|112291.535|Za 195 (97)19-09-19|31-01-19pica_rvp_IA2018
55438697448AauModernités des troubadoursNone20180077:13-11-18NoneNoneNone...NaNNaNNaNNonePN56.T768000|00091.282.92|288.69112-10-21|11-12-18pica_rvp_IA2018
66424315815AbvcLe @nouveau magazine littéraireNone20186050:29-12-17NoneNoneNone...NaNNaNNaNNoneNone330|022|07401/IA 6450|28 Rom Z 15592|Z 87124-07-20|10-01-18|08-01-18pica_rvp_IA2018
77481184937OauCourage de la vérité et écritures de l'histoire(XVIe-XVIIIe siècle)20170026:05-07-21NoneNoneNone...NaNNaNNaNNoneNoneNoneNone05-07-21|05-07-21|05-07-21|05-07-21|05-07-21|0...pica_rvp_IA2017
..................................................................
189755499448323869OaxKRTU und andere ProsadichtungenZweisprachige Ausgabe mit einem Nachwort von E...19886055:07-05-19Josep VicençFoixNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601988
1897560448323850OaxCalderónFremdheit und Nähe eines spanischen Barockdram...19886055:07-05-19NoneNoneNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601988
1897576448323842OaxDein Körper neben mirGedichte Zweisprachige Ausgabe19876055:07-05-19JaimeSabinesNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601987
1897587448323834OaxAvantgarde und RevolutionMexikanische Lyrik von López Velarde bis Octav...19876055:07-05-19NoneNoneNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601987
18976010344832184XOaxDie @Frau im spanischen Roman nach dem Bürgerk...Camilo José Cela, Carmen Laforet, Ana María Ma...19826055:07-05-19SylviaTruxaNone...NaNNaNNaNNoneNoneNoneNone16-11-20|09-05-19|24-09-19|07-02-20pica_sgt_8601982
\n", "

117018 rows × 49 columns

\n", "
" ], "text/plain": [ " index ppn medium \\\n", "3 3 450873676 Abvc \n", "4 4 442132174 Aau \n", "5 5 438697448 Aau \n", "6 6 424315815 Abvc \n", "7 7 481184937 Oau \n", "... ... ... ... \n", "189755 499 448323869 Oax \n", "189756 0 448323850 Oax \n", "189757 6 448323842 Oax \n", "189758 7 448323834 Oax \n", "189760 103 44832184X Oax \n", "\n", " title \\\n", "3 Francophonies du monde \n", "4 Le @\"théâtre provincial\" en France \n", "5 Modernités des troubadours \n", "6 Le @nouveau magazine littéraire \n", "7 Courage de la vérité et écritures de l'histoire \n", "... ... \n", "189755 KRTU und andere Prosadichtungen \n", "189756 Calderón \n", "189757 Dein Körper neben mir \n", "189758 Avantgarde und Revolution \n", "189760 Die @Frau im spanischen Roman nach dem Bürgerk... \n", "\n", " title_supplement year \\\n", "3 None 2019 \n", "4 (XVIe-XVIIIe siècle) 2018 \n", "5 None 2018 \n", "6 None 2018 \n", "7 (XVIe-XVIIIe siècle) 2017 \n", "... ... ... \n", "189755 Zweisprachige Ausgabe mit einem Nachwort von E... 1988 \n", "189756 Fremdheit und Nähe eines spanischen Barockdram... 1988 \n", "189757 Gedichte Zweisprachige Ausgabe 1987 \n", "189758 Mexikanische Lyrik von López Velarde bis Octav... 1987 \n", "189760 Camilo José Cela, Carmen Laforet, Ana María Ma... 1982 \n", "\n", " entry_first author_first_name author_last_name author_gnd_id ... \\\n", "3 6050:18-07-19 None None None ... \n", "4 0026:31-01-19 None None None ... \n", "5 0077:13-11-18 None None None ... \n", "6 6050:29-12-17 None None None ... \n", "7 0026:05-07-21 None None None ... \n", "... ... ... ... ... ... \n", "189755 6055:07-05-19 Josep Vicenç Foix None ... \n", "189756 6055:07-05-19 None None None ... \n", "189757 6055:07-05-19 Jaime Sabines None ... \n", "189758 6055:07-05-19 None None None ... \n", "189760 6055:07-05-19 Sylvia Truxa None ... \n", "\n", " keyword_BDSL_t keyword_BDSL_s \\\n", "3 NaN NaN \n", "4 NaN NaN \n", "5 NaN NaN \n", "6 NaN NaN \n", "7 NaN NaN \n", "... ... ... 
\n", "189755 NaN NaN \n", "189756 NaN NaN \n", "189757 NaN NaN \n", "189758 NaN NaN \n", "189760 NaN NaN \n", "\n", " keyword_Fremddatenlieferanten_lieferanten \\\n", "3 NaN \n", "4 NaN \n", "5 NaN \n", "6 NaN \n", "7 NaN \n", "... ... \n", "189755 NaN \n", "189756 NaN \n", "189757 NaN \n", "189758 NaN \n", "189760 NaN \n", "\n", " keyword_Fremddatenlieferanten lcc_notation signatur_place \\\n", "3 None None 330|022 \n", "4 None None 000|112 \n", "5 None PN56.T768 000|000 \n", "6 None None 330|022|074 \n", "7 None None None \n", "... ... ... ... \n", "189755 None None None \n", "189756 None None None \n", "189757 None None None \n", "189758 None None None \n", "189760 None None None \n", "\n", " signatur \\\n", "3 16/IA 5021a|28 Frz Z 1775[Suppl. \n", "4 291.535|Za 195 (97) \n", "5 91.282.92|288.691 \n", "6 01/IA 6450|28 Rom Z 15592|Z 871 \n", "7 None \n", "... ... \n", "189755 None \n", "189756 None \n", "189757 None \n", "189758 None \n", "189760 None \n", "\n", " signatur_date api_query \\\n", "3 14-06-07|24-07-19 pica_rvp_IA \n", "4 19-09-19|31-01-19 pica_rvp_IA \n", "5 12-10-21|11-12-18 pica_rvp_IA \n", "6 24-07-20|10-01-18|08-01-18 pica_rvp_IA \n", "7 05-07-21|05-07-21|05-07-21|05-07-21|05-07-21|0... pica_rvp_IA \n", "... ... ... \n", "189755 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189756 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189757 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189758 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "189760 16-11-20|09-05-19|24-09-19|07-02-20 pica_sgt_860 \n", "\n", " year_publication \n", "3 2019 \n", "4 2018 \n", "5 2018 \n", "6 2018 \n", "7 2017 \n", "... ... 
\n", "189755 1988 \n", "189756 1988 \n", "189757 1987 \n", "189758 1987 \n", "189760 1982 \n", "\n", "[117018 rows x 49 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## RVK" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['index',\n", " 'ppn',\n", " 'medium',\n", " 'title',\n", " 'title_supplement',\n", " 'year',\n", " 'entry_first',\n", " 'author_first_name',\n", " 'author_last_name',\n", " 'author_gnd_id',\n", " 'editor_first_name',\n", " 'editor_last_name',\n", " 'editor_gnd_id',\n", " 'isbn',\n", " 'ILNs',\n", " 'content_type_ppn',\n", " 'content_type',\n", " 'publisher',\n", " 'language_text',\n", " 'pages',\n", " 'comment_isbn',\n", " 'place_publication',\n", " 'summary',\n", " 'title_continuing_resource',\n", " 'work_ppn',\n", " 'work_info',\n", " 'work_title',\n", " 'DDC_notation',\n", " 'DDC_sachgruppe_a',\n", " 'DDC_sachgruppe_b',\n", " 'DDC_sachgruppe_c',\n", " 'RVK_ppn',\n", " 'RVK_notation',\n", " 'keyword_fremd_ppn',\n", " 'keyword_fremd',\n", " 'keyword_einzel_ppn',\n", " 'keyword_einzel',\n", " 'keyword_BDSL_a',\n", " 'keyword_BDSL_p',\n", " 'keyword_BDSL_t',\n", " 'keyword_BDSL_s',\n", " 'keyword_Fremddatenlieferanten_lieferanten',\n", " 'keyword_Fremddatenlieferanten',\n", " 'lcc_notation',\n", " 'signatur_place',\n", " 'signatur',\n", " 'signatur_date',\n", " 'api_query',\n", " 'year_publication']" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Column names of the filtered frame, as a plain list.\n", "list(concat_df.columns)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3 IA 1000|Romanistik||Zeitschriften\n", "4 IA 1000|Romanistik||Zeitschriften|IE 3155|Roma...\n", "5 IA 1000|Romanistik||Zeitschriften|IK 6510|Roma...\n", "6 IA 1000|Romanistik||Zeitschriften\n", "7 IE 1234|Romanistik||Französische
Literatur i.A...\n", " ... \n", "189755 None\n", "189756 None\n", "189757 None\n", "189758 None\n", "189760 None\n", "Name: RVK_notation, Length: 117018, dtype: object" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# RVK notation strings attached to each record.\n", "concat_df.loc[:, \"RVK_notation\"]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Tab-separated RVK table; the first column of the file becomes the index.\n", "rvk_df = pd.read_table(\"./../data/rvk.tsv\", sep=\"\\t\", index_col=0)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
notationbenennungcontentregistersiparent_notationparent_benennunghaupt_notation_1haupt_notation_2
0AAllgemeinesNaNNaN0NaNNaNAA
1AABibliografien der Bibliografien, Universalbibl...NaNNaN1AAllgemeinesAAA
2AA 09900Bibliografische ZeitschriftenErläuterungen zur Notationsvergabe s. RVK-Onli...Bibliografie|Zeitschrift2AABibliografien der Bibliografien, Universalbibl...AAA
3AA 10000 - AA 19900Bibliografien der Bibliografien(aber nicht Bibliographien der Länder- und Fac...Bibliografie3AABibliografien der Bibliografien, Universalbibl...AAA
4AA 10000International, AllgemeinesNaNNaN4AA 10000 - AA 19900Bibliografien der BibliografienAAA
..............................
839359ZY 9855Audiovisuelle Medien(Schallplatten, Filme usw.)NaN839359ZY 9850 - ZY 9859KartenspieleZZY
839360ZY 9856MeisterschaftenNaNNaN839360ZY 9850 - ZY 9859KartenspieleZZY
839361ZY 9857KongresseNaNNaN839361ZY 9850 - ZY 9859KartenspieleZZY
839362ZY 9858Führer; Karten(CSN der Region, dazu CSN des Autors)NaN839362ZY 9850 - ZY 9859KartenspieleZZY
839363ZY 9859Sonstiges(z.B. Therapie)NaN839363ZY 9850 - ZY 9859KartenspieleZZY
\n", "

839364 rows × 9 columns

\n", "
" ], "text/plain": [ " notation \\\n", "0 A \n", "1 AA \n", "2 AA 09900 \n", "3 AA 10000 - AA 19900 \n", "4 AA 10000 \n", "... ... \n", "839359 ZY 9855 \n", "839360 ZY 9856 \n", "839361 ZY 9857 \n", "839362 ZY 9858 \n", "839363 ZY 9859 \n", "\n", " benennung \\\n", "0 Allgemeines \n", "1 Bibliografien der Bibliografien, Universalbibl... \n", "2 Bibliografische Zeitschriften \n", "3 Bibliografien der Bibliografien \n", "4 International, Allgemeines \n", "... ... \n", "839359 Audiovisuelle Medien \n", "839360 Meisterschaften \n", "839361 Kongresse \n", "839362 Führer; Karten \n", "839363 Sonstiges \n", "\n", " content \\\n", "0 NaN \n", "1 NaN \n", "2 Erläuterungen zur Notationsvergabe s. RVK-Onli... \n", "3 (aber nicht Bibliographien der Länder- und Fac... \n", "4 NaN \n", "... ... \n", "839359 (Schallplatten, Filme usw.) \n", "839360 NaN \n", "839361 NaN \n", "839362 (CSN der Region, dazu CSN des Autors) \n", "839363 (z.B. Therapie) \n", "\n", " registers i parent_notation \\\n", "0 NaN 0 NaN \n", "1 NaN 1 A \n", "2 Bibliografie|Zeitschrift 2 AA \n", "3 Bibliografie 3 AA \n", "4 NaN 4 AA 10000 - AA 19900 \n", "... ... ... ... \n", "839359 NaN 839359 ZY 9850 - ZY 9859 \n", "839360 NaN 839360 ZY 9850 - ZY 9859 \n", "839361 NaN 839361 ZY 9850 - ZY 9859 \n", "839362 NaN 839362 ZY 9850 - ZY 9859 \n", "839363 NaN 839363 ZY 9850 - ZY 9859 \n", "\n", " parent_benennung haupt_notation_1 \\\n", "0 NaN A \n", "1 Allgemeines A \n", "2 Bibliografien der Bibliografien, Universalbibl... A \n", "3 Bibliografien der Bibliografien, Universalbibl... A \n", "4 Bibliografien der Bibliografien A \n", "... ... ... \n", "839359 Kartenspiele Z \n", "839360 Kartenspiele Z \n", "839361 Kartenspiele Z \n", "839362 Kartenspiele Z \n", "839363 Kartenspiele Z \n", "\n", " haupt_notation_2 \n", "0 A \n", "1 AA \n", "2 AA \n", "3 AA \n", "4 AA \n", "... ... 
\n", "839359 ZY \n", "839360 ZY \n", "839361 ZY \n", "839362 ZY \n", "839363 ZY \n", "\n", "[839364 rows x 9 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rvk_df" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
notationbenennungcontentregistersiparent_notationparent_benennunghaupt_notation_1haupt_notation_2
225139IE 5100Gesammelte WerkeNaNNaN225139IE 5100 - IE 5105Adam (de la Halle)IIE
225140IE 5101EinzelwerkeNaNNaN225140IE 5100 - IE 5105Adam (de la Halle)IIE
225143IE 5107Gesammelte Werke und EinzelwerkeNaNNaN225143IE 5107 - IE 5108Audefroi (le Bastard)IIE
225147IE 5112Gesammelte Werke und EinzelwerkeNaNNaN225147IE 5112 - IE 5113Baude, HenriIIE
225151IE 5143Gesammelte Werke und EinzelwerkeNaNNaN225151IE 5143 - IE 5144Busnois, AntoineIIE
..............................
257594IZ 9665Gesammelte Werke und EinzelwerkeNaNNaN257594IZ 9665 - IZ 9666Rodriguez Castelao, AlfonsoIIZ
257597IZ 9705Gesammelte Werke und EinzelwerkeNaNNaN257597IZ 9705 - IZ 9706Toro, Suso deIIZ
257600IZ 9707Gesammelte Werke und EinzelwerkeNaNNaN257600IZ 9707 - IZ 9708Toro, Xelís deIIZ
257603IZ 9831Gesammelte Werke und EinzelwerkeNaNNaN257603IZ 9831 - IZ 9832Vázquez, PuraIIZ
257606IZ 9841Gesammelte Werke und EinzelwerkeNaNNaN257606IZ 9841 - IZ 9842Villar, DomingoIIZ
\n", "

7873 rows × 9 columns

\n", "
" ], "text/plain": [ " notation benennung content registers i \\\n", "225139 IE 5100 Gesammelte Werke NaN NaN 225139 \n", "225140 IE 5101 Einzelwerke NaN NaN 225140 \n", "225143 IE 5107 Gesammelte Werke und Einzelwerke NaN NaN 225143 \n", "225147 IE 5112 Gesammelte Werke und Einzelwerke NaN NaN 225147 \n", "225151 IE 5143 Gesammelte Werke und Einzelwerke NaN NaN 225151 \n", "... ... ... ... ... ... \n", "257594 IZ 9665 Gesammelte Werke und Einzelwerke NaN NaN 257594 \n", "257597 IZ 9705 Gesammelte Werke und Einzelwerke NaN NaN 257597 \n", "257600 IZ 9707 Gesammelte Werke und Einzelwerke NaN NaN 257600 \n", "257603 IZ 9831 Gesammelte Werke und Einzelwerke NaN NaN 257603 \n", "257606 IZ 9841 Gesammelte Werke und Einzelwerke NaN NaN 257606 \n", "\n", " parent_notation parent_benennung haupt_notation_1 \\\n", "225139 IE 5100 - IE 5105 Adam (de la Halle) I \n", "225140 IE 5100 - IE 5105 Adam (de la Halle) I \n", "225143 IE 5107 - IE 5108 Audefroi (le Bastard) I \n", "225147 IE 5112 - IE 5113 Baude, Henri I \n", "225151 IE 5143 - IE 5144 Busnois, Antoine I \n", "... ... ... ... \n", "257594 IZ 9665 - IZ 9666 Rodriguez Castelao, Alfonso I \n", "257597 IZ 9705 - IZ 9706 Toro, Suso de I \n", "257600 IZ 9707 - IZ 9708 Toro, Xelís de I \n", "257603 IZ 9831 - IZ 9832 Vázquez, Pura I \n", "257606 IZ 9841 - IZ 9842 Villar, Domingo I \n", "\n", " haupt_notation_2 \n", "225139 IE \n", "225140 IE \n", "225143 IE \n", "225147 IE \n", "225151 IE \n", "... ... 
\n", "257594 IZ \n", "257597 IZ \n", "257600 IZ \n", "257603 IZ \n", "257606 IZ \n", "\n", "[7873 rows x 9 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rvk_df.loc[rvk_df[\"notation\"].str.contains(\"^I\") & ((rvk_df[\"benennung\"] == \"Gesammelte Werke und Einzelwerke\") | (rvk_df[\"benennung\"] == \"Gesammelte Werke\") | (rvk_df[\"benennung\"] == \"Einzelwerke\"))]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# Notations starting with \"I\" whose label marks collected and/or individual works.\n", "werke_labels = [\"Gesammelte Werke und Einzelwerke\", \"Gesammelte Werke\", \"Einzelwerke\"]\n", "werke_mask = rvk_df[\"notation\"].str.contains(\"^I\") & rvk_df[\"benennung\"].isin(werke_labels)\n", "werke_notations = rvk_df.loc[werke_mask, \"notation\"].tolist()\n", "\n", "# Drop every record whose RVK_notation starts with one of those notations.\n", "# A single anchored alternation filters concat_df in one pass instead of running\n", "# one full-frame filter per notation; re.escape keeps each notation literal.\n", "if werke_notations:  # an empty alternation would match, and so drop, every row\n", "    werke_pattern = \"^(?:\" + \"|\".join(map(re.escape, werke_notations)) + \")\"\n", "    concat_df = concat_df.loc[~concat_df[\"RVK_notation\"].str.contains(werke_pattern, na=False)]" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(93614, 49)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df.shape" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "245681639 1\n", "022601708 1\n", "060311606 1\n", "094421439 1\n", "00783747X 1\n", " ..\n", "365694657 1\n", "017233518 1\n", "050775871 1\n", "10366968X 1\n", "084459662 1\n", "Name: ppn, Length: 93614, dtype: int64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df.ppn.value_counts()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "concat_df.to_parquet(\"./../data/pica_rvp_IA-IZ_pica_sgt_440_860_filtered_year_wo_rvk.parquet\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "960f4d658457e0296618a96ff118bd2cbd6b6fed4373572a098ed4ec5a9be4c3"
}, "kernelspec": { "display_name": "Python 3.7.6 64-bit ('base': conda)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 }