Spaces:

NTaylor
/

pca_vs_lda

Runtime error

App Files Files Community

NTaylor committed on Apr 23, 2023

Commit

ad0428a

1 Parent(s): 97f5149

Upload app.py

Browse files

Files changed (1) hide show

app.py +104 -0

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""
+=======================================================
+Comparison of LDA and PCA 2D projection of Iris dataset
+=======================================================
+The Iris dataset represents 3 kinds of Iris flowers (Setosa, Versicolour
+and Virginica) with 4 attributes: sepal length, sepal width, petal length
+and petal width.
+Principal Component Analysis (PCA) applied to this data identifies the
+combination of attributes (principal components, or directions in the
+feature space) that account for the most variance in the data. Here we
+plot the different samples on the first 2 principal components.
+Linear Discriminant Analysis (LDA) tries to identify attributes that
+account for the most variance *between classes*. In particular,
+LDA, in contrast to PCA, is a supervised method, using known class labels.
+"""
+import matplotlib.pyplot as plt
+import gradio as gr
+from sklearn import datasets
+from sklearn.decomposition import PCA
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+# load data
+iris = datasets.load_iris()
+X = iris.data
+y = iris.target
+target_names = iris.target_names
+# fit PCA
+pca = PCA(n_components=2)
+X_r = pca.fit(X).transform(X)
+# fit LDA
+lda = LinearDiscriminantAnalysis(n_components=2)
+X_r2 = lda.fit(X, y).transform(X)
+# Percentage of variance explained for each components
+print(
+    "explained variance ratio (first two components): %s"
+    % str(pca.explained_variance_ratio_)
+)
+# TODO: save models using skops (not implemented; nothing is persisted yet)
+def plot_lda_pca():
+    fig = plt.figure(1, facecolor="w", figsize=(5,5))
+    colors = ["navy", "turquoise", "darkorange"]
+    lw = 2
+    for color, i, target_name in zip(colors, [0, 1, 2], target_names):
+        plt.scatter(
+            X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
+        )
+    plt.legend(loc="best", shadow=False, scatterpoints=1)
+    plt.title("PCA of IRIS dataset")
+    for color, i, target_name in zip(colors, [0, 1, 2], target_names):
+        plt.scatter(
+            X_r2[y == i, 0], X_r2[y == i, 1], alpha=0.8, color=color, label=target_name
+        )
+    plt.legend(loc="best", shadow=False, scatterpoints=1)
+    plt.title("LDA of IRIS dataset")
+    return fig
+title = "2-D projection of Iris dataset using LDA and PCA"
+with gr.Blocks(title=title) as demo:
+    gr.Markdown(f"# {title}")
+    gr.Markdown(" This example shows how one can use Prinicipal Components Analysis (PCA) and Factor Analysis (FA) for model selection by observing the likelihood of a held-out dataset with added noise <br>"
+    " The number of samples (n_samples) will determine the number of data points to produce.  <br>"
+    " The number of components (n_components) will determine the number of components each method will fit to, and will affect the likelihood of the held-out set.  <br>"
+    " The number of features (n_components) determine the number of features the toy dataset X variable will have.  <br>"
+    " For further details please see the sklearn docs:"
+    )
+    gr.Markdown(" **[Demo is based on sklearn docs found here](https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_lda.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-lda-py)** <br>")
+    gr.Markdown(" **Dataset** : A toy dataset with corrupted with homoscedastic noise (noise variance is the same for each feature) or heteroscedastic noise (noise variance is the different for each feature) . <br>")
+    gr.Markdown(" Different number of features and number of components affect how well the low rank space is recovered. <br>"
+                "  Larger Depth trying to overfit and learn even the finner details of the data.<br>"
+               )
+    # with gr.Row():
+    #     n_samples = gr.Slider(value=100, minimum=10, maximum=1000, step=10, label="n_samples")
+    #     n_components = gr.Slider(value=2, minimum=1, maximum=20, step=1, label="n_components")
+    #     n_features = gr.Slider(value=5, minimum=5, maximum=25, step=1, label="n_features")
+      # options for n_components
+    btn = gr.Button(value="Run")
+    btn.click(plot_lda_pca, outputs= gr.Plot(label='PCA vs LDA clustering') ) #
+demo.launch()