[04/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Term-frequency-v1.ipynb -- diff --git a/community-artifacts/Term-frequency-v1.ipynb b/community-artifacts/Term-frequency-v1.ipynb new file mode 100644 index 000..99a0cd0 --- /dev/null +++ b/community-artifacts/Term-frequency-v1.ipynb @@ -0,0 +1,1062 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Term Frequency\n", +"Term frequency computes the number of times that a word or term occurs in a document. Term frequency is often used as part of a larger text processing pipeline, which may include operations such as stemming, stop word removal and topic modelling." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: fmcquillan@madlib'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum 4.3.10.0\n", +"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", +"\n", +"# PostgreSQL local\n", +"%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang',)]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Prepare documents\n", +"First we create a document table with one document per row:" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "4 rows affected.\n", + "4 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "docid\n", + "contents\n", + "\n", + "\n", + "0\n", + "I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.\n", + "\n", + "\n", + "1\n", + "Chinchillas and kittens are cute.\n", + "\n", + "\n", + "2\n", + "My sister adopted two kittens yesterday.\n", + "\n", + "\n", + "3\n", + "Look at this cute hamster munching on a piece of broccoli.\n", + "\n", + "" + ], + "text/plain": [ + "[(0, u'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", + " (1, u'Chinchillas and kittens are cute.'),\n", + " (2, u'My sister adopted two kittens yesterday.'),\n", + " (3, u'Look at this cute hamster munching on a piece of broccoli.')]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%%sql\n", +"DROP TABLE IF EXISTS documents;\n", +"CREATE TABLE documents(docid INT4, contents TEXT);\n", +"\n", +"INSERT INTO documents VALUES\n", +"(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", +"(1, 'Chinchillas and kittens are cute.'),\n", +"(2, 'My sister adopted two kittens yesterday.'),\n", +"(3, 'Look at this cute hamster munching on a piece of broccoli.');\n", +"\n", +"SELECT * from documents ORDER BY docid;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"You can
[04/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Term-frequency-v1.ipynb -- diff --git a/community-artifacts/Term-frequency-v1.ipynb b/community-artifacts/Term-frequency-v1.ipynb new file mode 100644 index 000..99a0cd0 --- /dev/null +++ b/community-artifacts/Term-frequency-v1.ipynb @@ -0,0 +1,1062 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Term Frequency\n", +"Term frequency computes the number of times that a word or term occurs in a document. Term frequency is often used as part of a larger text processing pipeline, which may include operations such as stemming, stop word removal and topic modelling." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: fmcquillan@madlib'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum 4.3.10.0\n", +"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", +"\n", +"# PostgreSQL local\n", +"%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang',)]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Prepare documents\n", +"First we create a document table with one document per row:" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "4 rows affected.\n", + "4 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "docid\n", + "contents\n", + "\n", + "\n", + "0\n", + "I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.\n", + "\n", + "\n", + "1\n", + "Chinchillas and kittens are cute.\n", + "\n", + "\n", + "2\n", + "My sister adopted two kittens yesterday.\n", + "\n", + "\n", + "3\n", + "Look at this cute hamster munching on a piece of broccoli.\n", + "\n", + "" + ], + "text/plain": [ + "[(0, u'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", + " (1, u'Chinchillas and kittens are cute.'),\n", + " (2, u'My sister adopted two kittens yesterday.'),\n", + " (3, u'Look at this cute hamster munching on a piece of broccoli.')]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%%sql\n", +"DROP TABLE IF EXISTS documents;\n", +"CREATE TABLE documents(docid INT4, contents TEXT);\n", +"\n", +"INSERT INTO documents VALUES\n", +"(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", +"(1, 'Chinchillas and kittens are cute.'),\n", +"(2, 'My sister adopted two kittens yesterday.'),\n", +"(3, 'Look at this cute hamster munching on a piece of broccoli.');\n", +"\n", +"SELECT * from documents ORDER BY docid;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"You can