ultralytics 8.0.239 Ultralytics Actions and hub-sdk adoption (#7431)

Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Burhan <62214284+Burhan-Q@users.noreply.github.com>
Co-authored-by: Kayzwer <68285002+Kayzwer@users.noreply.github.com>
2024-01-10 03:16:08 +01:00 · 2024-01-10 03:16:08 +01:00 · fe27db2f6e
commit fe27db2f6e
parent e795277391
139 changed files with 6870 additions and 5125 deletions
--- a/ultralytics/data/explorer/explorer.py
+++ b/ultralytics/data/explorer/explorer.py
@@ -22,7 +22,6 @@ from .utils import get_sim_index_schema, get_table_schema, plot_query_result, pr


 class ExplorerDataset(YOLODataset):
-
    def __init__(self, *args, data: dict = None, **kwargs) -> None:
        super().__init__(*args, data=data, **kwargs)

@@ -35,7 +34,7 @@ class ExplorerDataset(YOLODataset):
            else:  # read image
                im = cv2.imread(f)  # BGR
                if im is None:
-                    raise FileNotFoundError(f'Image Not Found {f}')
+                    raise FileNotFoundError(f"Image Not Found {f}")
            h0, w0 = im.shape[:2]  # orig hw
            return im, (h0, w0), im.shape[:2]

@@ -44,7 +43,7 @@ class ExplorerDataset(YOLODataset):
    def build_transforms(self, hyp: IterableSimpleNamespace = None):
        """Creates transforms for dataset images without resizing."""
        return Format(
-            bbox_format='xyxy',
+            bbox_format="xyxy",
            normalize=False,
            return_mask=self.use_segments,
            return_keypoint=self.use_keypoints,
@@ -55,17 +54,16 @@ class ExplorerDataset(YOLODataset):


 class Explorer:
-
-    def __init__(self,
-                 data: Union[str, Path] = 'coco128.yaml',
-                 model: str = 'yolov8n.pt',
-                 uri: str = '~/ultralytics/explorer') -> None:
-        checks.check_requirements(['lancedb>=0.4.3', 'duckdb'])
+    def __init__(
+        self, data: Union[str, Path] = "coco128.yaml", model: str = "yolov8n.pt", uri: str = "~/ultralytics/explorer"
+    ) -> None:
+        checks.check_requirements(["lancedb>=0.4.3", "duckdb"])
        import lancedb

        self.connection = lancedb.connect(uri)
-        self.table_name = Path(data).name.lower() + '_' + model.lower()
-        self.sim_idx_base_name = f'{self.table_name}_sim_idx'.lower(
+        self.table_name = Path(data).name.lower() + "_" + model.lower()
+        self.sim_idx_base_name = (
+            f"{self.table_name}_sim_idx".lower()
        )  # Use this name and append thres and top_k to reuse the table
        self.model = YOLO(model)
        self.data = data  # None
@@ -74,7 +72,7 @@ class Explorer:
        self.table = None
        self.progress = 0

-    def create_embeddings_table(self, force: bool = False, split: str = 'train') -> None:
+    def create_embeddings_table(self, force: bool = False, split: str = "train") -> None:
        """
        Create LanceDB table containing the embeddings of the images in the dataset. The table will be reused if it
        already exists. Pass force=True to overwrite the existing table.
@@ -90,20 +88,20 @@ class Explorer:
            ```
        """
        if self.table is not None and not force:
-            LOGGER.info('Table already exists. Reusing it. Pass force=True to overwrite it.')
+            LOGGER.info("Table already exists. Reusing it. Pass force=True to overwrite it.")
            return
        if self.table_name in self.connection.table_names() and not force:
-            LOGGER.info(f'Table {self.table_name} already exists. Reusing it. Pass force=True to overwrite it.')
+            LOGGER.info(f"Table {self.table_name} already exists. Reusing it. Pass force=True to overwrite it.")
            self.table = self.connection.open_table(self.table_name)
            self.progress = 1
            return
        if self.data is None:
-            raise ValueError('Data must be provided to create embeddings table')
+            raise ValueError("Data must be provided to create embeddings table")

        data_info = check_det_dataset(self.data)
        if split not in data_info:
            raise ValueError(
-                f'Split {split} is not found in the dataset. Available keys in the dataset are {list(data_info.keys())}'
+                f"Split {split} is not found in the dataset. Available keys in the dataset are {list(data_info.keys())}"
            )

        choice_set = data_info[split]
@@ -113,13 +111,16 @@ class Explorer:

        # Create the table schema
        batch = dataset[0]
-        vector_size = self.model.embed(batch['im_file'], verbose=False)[0].shape[0]
-        table = self.connection.create_table(self.table_name, schema=get_table_schema(vector_size), mode='overwrite')
+        vector_size = self.model.embed(batch["im_file"], verbose=False)[0].shape[0]
+        table = self.connection.create_table(self.table_name, schema=get_table_schema(vector_size), mode="overwrite")
        table.add(
-            self._yield_batches(dataset,
-                                data_info,
-                                self.model,
-                                exclude_keys=['img', 'ratio_pad', 'resized_shape', 'ori_shape', 'batch_idx']))
+            self._yield_batches(
+                dataset,
+                data_info,
+                self.model,
+                exclude_keys=["img", "ratio_pad", "resized_shape", "ori_shape", "batch_idx"],
+            )
+        )

        self.table = table

@@ -131,12 +132,12 @@ class Explorer:
            for k in exclude_keys:
                batch.pop(k, None)
            batch = sanitize_batch(batch, data_info)
-            batch['vector'] = model.embed(batch['im_file'], verbose=False)[0].detach().tolist()
+            batch["vector"] = model.embed(batch["im_file"], verbose=False)[0].detach().tolist()
            yield [batch]

-    def query(self,
-              imgs: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
-              limit: int = 25) -> Any:  # pyarrow.Table
+    def query(
+        self, imgs: Union[str, np.ndarray, List[str], List[np.ndarray]] = None, limit: int = 25
+    ) -> Any:  # pyarrow.Table
        """
        Query the table for similar images. Accepts a single image or a list of images.

@@ -157,18 +158,18 @@ class Explorer:
            ```
        """
        if self.table is None:
-            raise ValueError('Table is not created. Please create the table first.')
+            raise ValueError("Table is not created. Please create the table first.")
        if isinstance(imgs, str):
            imgs = [imgs]
-        assert isinstance(imgs, list), f'img must be a string or a list of strings. Got {type(imgs)}'
+        assert isinstance(imgs, list), f"img must be a string or a list of strings. Got {type(imgs)}"
        embeds = self.model.embed(imgs)
        # Get avg if multiple images are passed (len > 1)
        embeds = torch.mean(torch.stack(embeds), 0).cpu().numpy() if len(embeds) > 1 else embeds[0].cpu().numpy()
        return self.table.search(embeds).limit(limit).to_arrow()

-    def sql_query(self,
-                  query: str,
-                  return_type: str = 'pandas') -> Union[DataFrame, Any, None]:  # pandas.dataframe or pyarrow.Table
+    def sql_query(
+        self, query: str, return_type: str = "pandas"
+    ) -> Union[DataFrame, Any, None]:  # pandas.dataframe or pyarrow.Table
        """
        Run a SQL-Like query on the table. Utilizes LanceDB predicate pushdown.

@@ -187,27 +188,29 @@ class Explorer:
            result = exp.sql_query(query)
            ```
        """
-        assert return_type in ['pandas',
-                               'arrow'], f'Return type should be either `pandas` or `arrow`, but got {return_type}'
+        assert return_type in [
+            "pandas",
+            "arrow",
+        ], f"Return type should be either `pandas` or `arrow`, but got {return_type}"
        import duckdb

        if self.table is None:
-            raise ValueError('Table is not created. Please create the table first.')
+            raise ValueError("Table is not created. Please create the table first.")

        # Note: using filter pushdown would be a better long term solution. Temporarily using duckdb for this.
        table = self.table.to_arrow()  # noqa NOTE: Don't comment this. This line is used by DuckDB
-        if not query.startswith('SELECT') and not query.startswith('WHERE'):
+        if not query.startswith("SELECT") and not query.startswith("WHERE"):
            raise ValueError(
-                f'Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE clause. found {query}'
+                f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE clause. found {query}"
            )
-        if query.startswith('WHERE'):
+        if query.startswith("WHERE"):
            query = f"SELECT * FROM 'table' {query}"
-        LOGGER.info(f'Running query: {query}')
+        LOGGER.info(f"Running query: {query}")

        rs = duckdb.sql(query)
-        if return_type == 'pandas':
+        if return_type == "pandas":
            return rs.df()
-        elif return_type == 'arrow':
+        elif return_type == "arrow":
            return rs.arrow()

    def plot_sql_query(self, query: str, labels: bool = True) -> Image.Image:
@@ -228,18 +231,20 @@ class Explorer:
            result = exp.plot_sql_query(query)
            ```
        """
-        result = self.sql_query(query, return_type='arrow')
+        result = self.sql_query(query, return_type="arrow")
        if len(result) == 0:
-            LOGGER.info('No results found.')
+            LOGGER.info("No results found.")
            return None
        img = plot_query_result(result, plot_labels=labels)
        return Image.fromarray(img)

-    def get_similar(self,
-                    img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
-                    idx: Union[int, List[int]] = None,
-                    limit: int = 25,
-                    return_type: str = 'pandas') -> Union[DataFrame, Any]:  # pandas.dataframe or pyarrow.Table
+    def get_similar(
+        self,
+        img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
+        idx: Union[int, List[int]] = None,
+        limit: int = 25,
+        return_type: str = "pandas",
+    ) -> Union[DataFrame, Any]:  # pandas.dataframe or pyarrow.Table
        """
        Query the table for similar images. Accepts a single image or a list of images.

@@ -259,21 +264,25 @@ class Explorer:
            similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
            ```
        """
-        assert return_type in ['pandas',
-                               'arrow'], f'Return type should be either `pandas` or `arrow`, but got {return_type}'
+        assert return_type in [
+            "pandas",
+            "arrow",
+        ], f"Return type should be either `pandas` or `arrow`, but got {return_type}"
        img = self._check_imgs_or_idxs(img, idx)
        similar = self.query(img, limit=limit)

-        if return_type == 'pandas':
+        if return_type == "pandas":
            return similar.to_pandas()
-        elif return_type == 'arrow':
+        elif return_type == "arrow":
            return similar

-    def plot_similar(self,
-                     img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
-                     idx: Union[int, List[int]] = None,
-                     limit: int = 25,
-                     labels: bool = True) -> Image.Image:
+    def plot_similar(
+        self,
+        img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
+        idx: Union[int, List[int]] = None,
+        limit: int = 25,
+        labels: bool = True,
+    ) -> Image.Image:
        """
        Plot the similar images. Accepts images or indexes.

@@ -293,9 +302,9 @@ class Explorer:
            similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
            ```
        """
-        similar = self.get_similar(img, idx, limit, return_type='arrow')
+        similar = self.get_similar(img, idx, limit, return_type="arrow")
        if len(similar) == 0:
-            LOGGER.info('No results found.')
+            LOGGER.info("No results found.")
            return None
        img = plot_query_result(similar, plot_labels=labels)
        return Image.fromarray(img)
@@ -323,34 +332,37 @@ class Explorer:
            ```
        """
        if self.table is None:
-            raise ValueError('Table is not created. Please create the table first.')
-        sim_idx_table_name = f'{self.sim_idx_base_name}_thres_{max_dist}_top_{top_k}'.lower()
+            raise ValueError("Table is not created. Please create the table first.")
+        sim_idx_table_name = f"{self.sim_idx_base_name}_thres_{max_dist}_top_{top_k}".lower()
        if sim_idx_table_name in self.connection.table_names() and not force:
-            LOGGER.info('Similarity matrix already exists. Reusing it. Pass force=True to overwrite it.')
+            LOGGER.info("Similarity matrix already exists. Reusing it. Pass force=True to overwrite it.")
            return self.connection.open_table(sim_idx_table_name).to_pandas()

        if top_k and not (1.0 >= top_k >= 0.0):
-            raise ValueError(f'top_k must be between 0.0 and 1.0. Got {top_k}')
+            raise ValueError(f"top_k must be between 0.0 and 1.0. Got {top_k}")
        if max_dist < 0.0:
-            raise ValueError(f'max_dist must be greater than 0. Got {max_dist}')
+            raise ValueError(f"max_dist must be greater than 0. Got {max_dist}")

        top_k = int(top_k * len(self.table)) if top_k else len(self.table)
        top_k = max(top_k, 1)
-        features = self.table.to_lance().to_table(columns=['vector', 'im_file']).to_pydict()
-        im_files = features['im_file']
-        embeddings = features['vector']
+        features = self.table.to_lance().to_table(columns=["vector", "im_file"]).to_pydict()
+        im_files = features["im_file"]
+        embeddings = features["vector"]

-        sim_table = self.connection.create_table(sim_idx_table_name, schema=get_sim_index_schema(), mode='overwrite')
+        sim_table = self.connection.create_table(sim_idx_table_name, schema=get_sim_index_schema(), mode="overwrite")

        def _yield_sim_idx():
            """Generates a dataframe with similarity indices and distances for images."""
            for i in tqdm(range(len(embeddings))):
-                sim_idx = self.table.search(embeddings[i]).limit(top_k).to_pandas().query(f'_distance <= {max_dist}')
-                yield [{
-                    'idx': i,
-                    'im_file': im_files[i],
-                    'count': len(sim_idx),
-                    'sim_im_files': sim_idx['im_file'].tolist()}]
+                sim_idx = self.table.search(embeddings[i]).limit(top_k).to_pandas().query(f"_distance <= {max_dist}")
+                yield [
+                    {
+                        "idx": i,
+                        "im_file": im_files[i],
+                        "count": len(sim_idx),
+                        "sim_im_files": sim_idx["im_file"].tolist(),
+                    }
+                ]

        sim_table.add(_yield_sim_idx())
        self.sim_index = sim_table
@@ -381,7 +393,7 @@ class Explorer:
            ```
        """
        sim_idx = self.similarity_index(max_dist=max_dist, top_k=top_k, force=force)
-        sim_count = sim_idx['count'].tolist()
+        sim_count = sim_idx["count"].tolist()
        sim_count = np.array(sim_count)

        indices = np.arange(len(sim_count))
@@ -390,25 +402,26 @@ class Explorer:
        plt.bar(indices, sim_count)

        # Customize the plot (optional)
-        plt.xlabel('data idx')
-        plt.ylabel('Count')
-        plt.title('Similarity Count')
+        plt.xlabel("data idx")
+        plt.ylabel("Count")
+        plt.title("Similarity Count")
        buffer = BytesIO()
-        plt.savefig(buffer, format='png')
+        plt.savefig(buffer, format="png")
        buffer.seek(0)

        # Use Pillow to open the image from the buffer
        return Image.fromarray(np.array(Image.open(buffer)))

-    def _check_imgs_or_idxs(self, img: Union[str, np.ndarray, List[str], List[np.ndarray], None],
-                            idx: Union[None, int, List[int]]) -> List[np.ndarray]:
+    def _check_imgs_or_idxs(
+        self, img: Union[str, np.ndarray, List[str], List[np.ndarray], None], idx: Union[None, int, List[int]]
+    ) -> List[np.ndarray]:
        if img is None and idx is None:
-            raise ValueError('Either img or idx must be provided.')
+            raise ValueError("Either img or idx must be provided.")
        if img is not None and idx is not None:
-            raise ValueError('Only one of img or idx must be provided.')
+            raise ValueError("Only one of img or idx must be provided.")
        if idx is not None:
            idx = idx if isinstance(idx, list) else [idx]
-            img = self.table.to_lance().take(idx, columns=['im_file']).to_pydict()['im_file']
+            img = self.table.to_lance().take(idx, columns=["im_file"]).to_pydict()["im_file"]

        return img if isinstance(img, list) else [img]

@@ -433,7 +446,7 @@ class Explorer:
        try:
            df = self.sql_query(result)
        except Exception as e:
-            LOGGER.error('AI generated query is not valid. Please try again with a different prompt')
+            LOGGER.error("AI generated query is not valid. Please try again with a different prompt")
            LOGGER.error(e)
            return None
        return df