{ "cells": [ { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import\n", "import pandas\n", "import re\n", "# Alignement \n", "import nltk\n", "import codecs\n", "import gensim\n", "from scipy import sparse\n", "import itertools\n", "from sklearn.feature_extraction.text import CountVectorizer ,TfidfTransformer\n", "import scipy.sparse\n", "import scipy.io\n", "from sklearn import preprocessing\n", "from keras.models import Sequential\n", "from keras.layers.core import Dense, Dropout, Activation,AutoEncoder\n", "from keras.optimizers import SGD\n", "from keras.layers import containers\n", "from mlp import *\n", "import mlp\n", "import sklearn.metrics\n", "import shelve\n", "import pickle\n", "from collections import Counter\n", "import sys\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "ename": "ImportError", "evalue": "cannot import name interfaces", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mgensim\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m/home/laboinfo/janod/.pyenv/versions/2.7.10/lib/python2.7/site-packages/gensim/__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 4\u001b[0m \"\"\"\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mgensim\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mparsing\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmatutils\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minterfaces\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcorpora\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodels\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msimilarities\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msummarization\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mlogging\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/laboinfo/janod/.pyenv/versions/2.7.10/lib/python2.7/site-packages/gensim/models/__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;31m# bring model classes directly into package namespace, to save some typing\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mhdpmodel\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mHdpModel\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mldamodel\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mLdaModel\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mlsimodel\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mLsiModel\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m/home/laboinfo/janod/.pyenv/versions/2.7.10/lib/python2.7/site-packages/gensim/models/hdpmodel.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 41\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mscipy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mspecial\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0msp\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 43\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mgensim\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0minterfaces\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mutils\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmatutils\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 44\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmoves\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mxrange\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 45\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mImportError\u001b[0m: cannot import name interfaces" ] } ], "source": [ "import gensim" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "ASR={}\n", "TRS={}\n", "ASR[\"TRAIN\"]=pandas.read_table(\"./ASR/corpus_TRAIN_ASR.srl\",sep=\"\\t\",header=None,na_values=None,keep_default_na=False)\n", "ASR[\"DEV\"]=pandas.read_table(\"./ASR/corpus_DEV_ASR.srl\",sep=\"\\t\",header=None,na_values=None,keep_default_na=False)\n", "ASR[\"TEST\"]=pandas.read_table(\"./ASR/corpus_TEST_ASR.srl\",sep=\"\\t\",header=None,na_values=None,keep_default_na=False)\n", "\n", "TRS[\"TRAIN\"]=pandas.read_table(\"./TRS/corpus_TRAIN_TRS.srl\",sep=\"\\t\",header=None,na_values=None,keep_default_na=False)\n", "TRS[\"DEV\"]=pandas.read_table(\"./TRS/corpus_DEV_TRS.srl\",sep=\"\\t\",header=None,na_values=None,keep_default_na=False)\n", "TRS[\"TEST\"]=pandas.read_table(\"./TRS/corpus_TEST_TRS.srl\",sep=\"\\t\",header=None,na_values=None,keep_default_na=False)\n" ] }, { "cell_type": "code", "execution_count": 251, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
0DECODA20101206_RATP_SCD_0424_6là vous répondre là vous répondre enfin ou...
1DECODA20101206_RATP_SCD_0062_1bonsoir oui bonsoir madame je vous téléphone ...
2DECODA20101206_RATP_SCD_0425_6et vous répondre voilà je sais et vous rép...
3DECODA20091112_RATP_SCD_1209_4bonsoir oui bonsoir monsieur bonsoir bonsoir...
4DECODA20091112_RATP_SCD_1208_4bonjour monsieur écoute hein bonjour monsieur...
\n", "
" ], "text/plain": [ " 0 1 \\\n", "0 DECODA 20101206_RATP_SCD_0424_6 \n", "1 DECODA 20101206_RATP_SCD_0062_1 \n", "2 DECODA 20101206_RATP_SCD_0425_6 \n", "3 DECODA 20091112_RATP_SCD_1209_4 \n", "4 DECODA 20091112_RATP_SCD_1208_4 \n", "\n", " 2 \n", "0 là vous répondre là vous répondre enfin ou... \n", "1 bonsoir oui bonsoir madame je vous téléphone ... \n", "2 et vous répondre voilà je sais et vous rép... \n", "3 bonsoir oui bonsoir monsieur bonsoir bonsoir... \n", "4 bonjour monsieur écoute hein bonjour monsieur... " ] }, "execution_count": 251, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ASR[\"TRAIN\"].head()" ] }, { "cell_type": "code", "execution_count": 309, "metadata": { "collapsed": false }, "outputs": [], "source": [ "coucou=ASR[\"TRAIN\"].sort_values(by=[1])[1] == TRS[\"TRAIN\"].sort_values(by=[1])[1]" ] }, { "cell_type": "code", "execution_count": 314, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 314, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coucou.all(axis=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "ASR[\"TRAIN\"].sort" ] }, { "cell_type": "code", "execution_count": 192, "metadata": { "collapsed": false }, "outputs": [], "source": [ "tok2 = nltk.RegexpTokenizer(ur\"(?u)\\w+|\\d+(?:[.,]\\d+)?(?:\\s*%)?|\\w'|[^\\w\\s]\",flags=re.UNICODE|re.DOTALL )\n", "#\"\\d+(\\[.,]\\d+)?\\s*%|\\w'|\\w+|[^\\w\\s]\"\n", " # \\d+(\\.\\d+)?\\s*% # les pourcentages\n", " # | \\w' # les contractions d', l', ...\n", " # | \\w+ # les mots pleins\n", " # | [^\\w\\s] # les ponctuations\n", " # (?:[lcdjmnts]|qu)['’] # Contractions\n", " # | http:[^\\s]+\\.\\w{2,3} # Adresses web\n", " # | \\d+[.,]\\d+ # Les réels en/fr\n", " # | [.-]+ # Les ponctuations\n", " # | \\w+ # Les mots pleins\n", " # | [^\\w\\s] \n", "def yield_corpus(df_list):\n", " for corpus in df_list:\n", " for id,doc in corpus.iterrows():\n", " try:\n", " #print doc[2]\n", " yield tok2.tokenize(doc[2].decode(\"utf8\"))\n", " except:\n", " e = sys.exc_info()[0]\n", " print doc[2],e\n", " raise\n", "def yield_corpus_wbw(df_list):\n", " for corpus in df_list:\n", " for id,doc in corpus.iterrows():\n", " try:\n", " for x in tok2.tokenize(doc[2].decode(\"utf-8\")):\n", " yield x.lower()\n", " except:\n", " print doc[2]\n", " raise" ] }, { "cell_type": "code", "execution_count": 411, "metadata": { "collapsed": false }, "outputs": [], "source": [ "vocab1=[x.split(':')[0] for x in codecs.open(\"./lists_mots/listeMots_300WPT__TRAIN_RES.txt\")]\n", "vocab2=[x.split(':')[0] for x in codecs.open(\"./lists_mots/listeMots_300WPT__TRAIN_TRS.txt\")]\n", "vocab3=[x.split(':')[0] for x in codecs.open(\"./lists_mots/listeMots_300WPT__TRAIN_TRS_RES.txt\")]\n", "#vocab3=list(set(vocab1).union(set(vocab2)))" ] }, { "cell_type": "code", "execution_count": 386, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1310 1439 1432\n", "1031\n", "1718\n" ] } ], "source": [ "print len(vocab1),len(vocab2),len(vocab3)\n", "print len(set(vocab1).intersection(set(vocab2)))\n", "print len(set(vocab1).union(set(vocab2)))" ] }, { "cell_type": "code", "execution_count": 399, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ASR_count=Counter(yield_corpus_wbw([ASR[\"TRAIN\"]]))" ] }, { "cell_type": "code", "execution_count": 412, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dico=CountVectorizer(binary=True,vocabulary=vocab3)" ] }, { "cell_type": "code", "execution_count": 413, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sparse=shelve.open(\"Sparse_mat_bin.shelve\",writeback=True)\n", "sparse[\"ASR\"]={}\n", "sparse[\"TRS\"]={}\n", "sparse[\"LABEL\"]={}\n", "sparse[\"LABEL\"][\"TRAIN\"]=ASR[\"TRAIN\"].sort_values(by=[1])[1]\n", "sparse[\"LABEL\"][\"DEV\"]=ASR[\"DEV\"].sort_values(by=[1])[1]\n", "sparse[\"LABEL\"][\"TEST\"]=ASR[\"TEST\"].sort_values(by=[1])[1]\n", "for key in ASR.keys():\n", " sparse[\"ASR\"][key]=dico.transform(ASR[key].sort_values(by=[1])[2])\n", " sparse[\"TRS\"][key]=dico.transform(TRS[key].sort_values(by=[1])[2])\n", "sparse.sync()\n", "sparse.close()" ] }, { "cell_type": "code", "execution_count": 416, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sparse=shelve.open(\"Sparse_mat_tfidf.shelve\",writeback=True)\n", "sparse[\"ASR\"]={}\n", "sparse[\"TRS\"]={}\n", "sparse[\"LABEL\"]={}\n", "sparse[\"LABEL\"][\"TRAIN\"]=ASR[\"TRAIN\"].sort_values(by=[1])[1]\n", "sparse[\"LABEL\"][\"DEV\"]=ASR[\"DEV\"].sort_values(by=[1])[1]\n", "sparse[\"LABEL\"][\"TEST\"]=ASR[\"TEST\"].sort_values(by=[1])[1]\n", "\n", "tf_ASR=TfidfTransformer(norm=\"l2\")\n", "tf_TRS=TfidfTransformer(norm=\"l2\")\n", "tf_TRS.fit(dico2.transform(TRS[\"TRAIN\"].sort_values(by=[1])[2]))\n", "tf_ASR.fit(dico2.transform(ASR[\"TRAIN\"].sort_values(by=[1])[2]))\n", "for key in ASR.keys():\n", " sparse[\"ASR\"][key]=tf_ASR.transform(dico2.transform(ASR[key].sort_values(by=[1])[2]))\n", " sparse[\"TRS\"][key]=tf_TRS.transform(dico2.transform(TRS[key].sort_values(by=[1])[2]))" ] }, { "cell_type": "code", "execution_count": 417, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sparse.sync()\n", "sparse.close()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "all_vocab=list(set(yield_corpus_wbw([ASR[\"TRAIN\"],TRS[\"TRAIN\"]])))\n", "all_vocab.sort()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[u'a4', u'abandonn\\xe9', u'abbesses', u'abb\\xe9', u'abcd']" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_vocab[10:15]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dico3=CountVectorizer(binary=True,vocabulary=all_vocab)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sparse=shelve.open(\"Sparse_mat_bigbin.shelve\",writeback=True)\n", "sparse[\"ASR\"]={}\n", "sparse[\"TRS\"]={}\n", "sparse[\"LABEL\"]={}\n", "sparse[\"LABEL\"][\"TRAIN\"]=ASR[\"TRAIN\"].sort_values(by=[1])[1]\n", "sparse[\"LABEL\"][\"DEV\"]=ASR[\"DEV\"].sort_values(by=[1])[1]\n", "sparse[\"LABEL\"][\"TEST\"]=ASR[\"TEST\"].sort_values(by=[1])[1]\n", "\n", "tf_ASR=TfidfTransformer(norm=\"l2\")\n", "tf_TRS=TfidfTransformer(norm=\"l2\")\n", "tf_TRS.fit(dico3.transform(TRS[\"TRAIN\"].sort_values(by=[1])[2]))\n", "tf_ASR.fit(dico3.transform(ASR[\"TRAIN\"].sort_values(by=[1])[2]))\n", "for key in ASR.keys():\n", " sparse[\"ASR\"][key]=tf_ASR.transform(dico3.transform(ASR[key].sort_values(by=[1])[2]))\n", " sparse[\"TRS\"][key]=tf_TRS.transform(dico3.transform(TRS[key].sort_values(by=[1])[2]))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sparse.sync()\n", "sparse.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sparse=shelve.open(\"Sparse_mat_bigbin.shelve\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }