<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: ML algorithm was given empty dataset. in Data &amp; Schema Discussions</title>
    <link>https://community.incorta.com/t5/data-schema-discussions/ml-algorithm-was-given-empty-dataset/m-p/6410#M559</link>
    <description>&lt;P&gt;This issue has been resolved by setting the property&amp;nbsp;&lt;STRONG&gt;&lt;I&gt;spark.dataframe.sampling.enabled&lt;/I&gt;&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;to false.&lt;/SPAN&gt;&lt;/P&gt;</description>
    <pubDate>Mon, 14 Apr 2025 18:05:00 GMT</pubDate>
    <dc:creator>shaikshoaib</dc:creator>
    <dc:date>2025-04-14T18:05:00Z</dc:date>
    <item>
      <title>ML algorithm was given empty dataset.</title>
      <link>https://community.incorta.com/t5/data-schema-discussions/ml-algorithm-was-given-empty-dataset/m-p/6400#M558</link>
      <description>&lt;P&gt;Hi team,&lt;/P&gt;&lt;P&gt;I am trying to build a Random Forest (RF) model in Incorta.&lt;/P&gt;&lt;P&gt;Even after splitting the data as suggested in the article below, I am getting the following error:&amp;nbsp;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A href="https://community.incorta.com/t5/data-schemas-knowledgebase/split-a-dataset-into-training-and-testing-data-sets/ta-p/198" target="_blank"&gt;https://community.incorta.com/t5/data-schemas-knowledgebase/split-a-dataset-into-training-and-testing-data-sets/ta-p/198&lt;/A&gt;&lt;/P&gt;&lt;PRE&gt;Transformation error 25/04/13 05:28:44 ERROR Instrumentation: org.apache.spark.SparkException: ML algorithm was given empty dataset.
Error An error occurred while calling o341.fit.&lt;/PRE&gt;&lt;P&gt;%pyspark&lt;BR /&gt;from pyspark.sql import SparkSession&lt;BR /&gt;from pyspark.ml.feature import VectorAssembler&lt;BR /&gt;from pyspark.ml.classification import RandomForestClassifier&lt;BR /&gt;from pyspark.ml.evaluation import MulticlassClassificationEvaluator&lt;BR /&gt;from pyspark.sql.functions import col&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;# Load dataset&lt;BR /&gt;df_train = read("machine_learning.train_data")&lt;BR /&gt;df_test = read("machine_learning.Test_data")&lt;/P&gt;&lt;P&gt;# Define features and label&lt;BR /&gt;feature_cols = ['Temperature', 'Pressure', 'Vibration_Level', 'Humidity', 'Power_Consumption']&lt;BR /&gt;label_col = 'Failure_Status'&lt;/P&gt;&lt;P&gt;# Assemble features into a single vector&lt;BR /&gt;assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")&lt;BR /&gt;df_vector = assembler.transform(df_train).select("features", col(label_col).cast("int").alias(label_col))&lt;/P&gt;&lt;P&gt;# Train-test split&lt;BR /&gt;# train_data, test_data = df_vector.randomSplit([0.8, 0.2], seed=42)&lt;/P&gt;&lt;P&gt;assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")&lt;BR /&gt;df_train_vector = assembler.transform(df_train).select("features", col(label_col).cast("int").alias(label_col))&lt;BR /&gt;df_test_vector = assembler.transform(df_test).select("features", col(label_col).cast("int").alias(label_col))&lt;/P&gt;&lt;P&gt;# Train Random Forest model&lt;BR /&gt;rf = RandomForestClassifier(labelCol=label_col, featuresCol="features", numTrees=100, seed=42)&lt;BR /&gt;rf_model = rf.fit(df_train_vector)&lt;/P&gt;&lt;P&gt;# Predict on test set&lt;BR /&gt;predictions = rf_model.transform(df_test_vector)&lt;/P&gt;&lt;P&gt;# Evaluate accuracy&lt;BR /&gt;# evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="accuracy")&lt;BR /&gt;# accuracy = evaluator.evaluate(predictions)&lt;BR /&gt;# 
print("Accuracy:", accuracy)&lt;/P&gt;&lt;P&gt;# Extract predictions and actual labels&lt;BR /&gt;# y_pred = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()&lt;BR /&gt;# y_true = predictions.select(label_col).rdd.flatMap(lambda x: x).collect()&lt;/P&gt;&lt;P&gt;# Print classification report&lt;BR /&gt;# print(classification_report(y_true, y_pred))&lt;/P&gt;&lt;P&gt;# Combine results for inspection&lt;BR /&gt;# results = predictions.select(*feature_cols, label_col, col("prediction").alias("Predicted_Failure_Status"))&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;# Show first 10 rows&lt;BR /&gt;predictions.show(10)&lt;BR /&gt;save(predictions)&lt;/P&gt;&lt;P&gt;&amp;nbsp;Can anyone help me understand what the issue is here?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sun, 13 Apr 2025 05:33:37 GMT</pubDate>
      <guid>https://community.incorta.com/t5/data-schema-discussions/ml-algorithm-was-given-empty-dataset/m-p/6400#M558</guid>
      <dc:creator>shaikshoaib</dc:creator>
      <dc:date>2025-04-13T05:33:37Z</dc:date>
    </item>
    <item>
      <title>Re: ML algorithm was given empty dataset.</title>
      <link>https://community.incorta.com/t5/data-schema-discussions/ml-algorithm-was-given-empty-dataset/m-p/6410#M559</link>
      <description>&lt;P&gt;This issue has been resolved by setting the property&amp;nbsp;&lt;STRONG&gt;&lt;I&gt;spark.dataframe.sampling.enabled&lt;/I&gt;&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;to false.&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 14 Apr 2025 18:05:00 GMT</pubDate>
      <guid>https://community.incorta.com/t5/data-schema-discussions/ml-algorithm-was-given-empty-dataset/m-p/6410#M559</guid>
      <dc:creator>shaikshoaib</dc:creator>
      <dc:date>2025-04-14T18:05:00Z</dc:date>
    </item>
    <item>
      <title>Re: ML algorithm was given empty dataset.</title>
      <link>https://community.incorta.com/t5/data-schema-discussions/ml-algorithm-was-given-empty-dataset/m-p/6421#M561</link>
      <description>&lt;P&gt;Glad you were able to figure it out! This would have been my suggestion.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 16 Apr 2025 20:55:14 GMT</pubDate>
      <guid>https://community.incorta.com/t5/data-schema-discussions/ml-algorithm-was-given-empty-dataset/m-p/6421#M561</guid>
      <dc:creator>JoeM</dc:creator>
      <dc:date>2025-04-16T20:55:14Z</dc:date>
    </item>
  </channel>
</rss>

