代码拉取完成,页面将自动刷新
<!-- livebook:{"file_entries":[{"name":"xla_extension-x86_64-linux-gnu-cuda118.tar.gzxla_extension-x86_64-linux-gnu-cuda118.tar.gz","type":"attachment"}]} -->
# TrainHub
```elixir
# File.rm("/home/user/.cache/xla/0.5.1/cache/download/xla_extension-x86_64-linux-gnu-cuda118.tar.gz")
# File.cp("/home/user/files/xla_extension-x86_64-linux-gnu-cuda118.tar.gzxla_extension-x86_64-linux-gnu-cuda118.tar.gz","/home/user/.cache/xla/0.5.1/cache/download/xla_extension-x86_64-linux-gnu-cuda118.tar.gz")
# Install the notebook's dependencies. Axon and Bumblebee are pulled from
# Gitee-hosted git repositories (branch "main") instead of Hex releases.
Mix.install(
[
# Data Collection
{:tentacat, "~> 2.0"},
# Machine Learning
{:axon, git: "https://gitee.com/mirrors_elixir-nx/axon", branch: "main", override: true},
# NOTE(review): `force: true` is documented as a Mix.install/2 option rather
# than a per-dependency option — confirm it has the intended effect here.
{:bumblebee, git: "https://gitee.com/edmondfrank/bumblebee", branch: "main", force: true},
{:polaris, "~> 0.1"},
# `override: true` makes these requirements take precedence over the ones
# declared by the other dependencies.
{:exla, "~> 0.6", override: true},
{:nx, "~> 0.6", override: true},
{:httpoison, "~> 2.2"}
],
# Environment variable set for the installation; "cuda118" presumably selects
# the CUDA 11.8 XLA build — confirm against the XLA documentation.
system_env: %{
"XLA_TARGET" => "cuda118"
}
)
```
## Section
```elixir
# Ask EXLA which platforms it supports and bind the result to `platforms`.
platforms = EXLA.Client.get_supported_platforms()
```
```elixir
# Organisation whose repositories are listed, and the local directory the
# notebook uses for the cloned repositories.
org = "oss-compass"
repos_dir = "/home/user/repos"
# Client built with no arguments (presumably unauthenticated — confirm).
client = Tentacat.Client.new()
# Assertive match: any response whose status is not 200 raises a MatchError.
{200, repos, _} = Tentacat.Repositories.list_orgs(client, org)
```
```elixir
# Clone every listed repository into `repos_dir`, skipping the ones that are
# already present on disk. Clones are fetched from gitee.com using each
# repository's "full_name" (owner/project).
File.mkdir_p!(repos_dir)

Enum.each(repos, fn %{"full_name" => repo} ->
  [_owner, project] = String.split(repo, "/")
  path = Path.join(repos_dir, project)

  if not File.exists?(path) do
    # Capture git's stderr too, so a failed clone is reported instead of being
    # silently ignored. A failure is logged and the remaining clones continue.
    case System.cmd("git", ["clone", "https://gitee.com/#{repo}", path], stderr_to_stdout: true) do
      {_output, 0} ->
        :ok

      {output, status} ->
        IO.puts(:stderr, "git clone of #{repo} failed (exit #{status}): #{output}")
    end
  end
end)
```
```elixir
# Same directory as the clone destination used earlier in the notebook.
repos_dir = "/home/user/repos"
# Returns `{:ok, entries}` or `{:error, reason}`; shown as the cell result.
File.ls(repos_dir)
```
```elixir
# Collect every Ruby / Python / Rake source file below `repos_dir` as a map
# with the project (first directory under `repos_dir`), the path relative to
# that project, and the file contents.
data =
  [repos_dir, "**", "*.{rb,py,rake}"]
  |> Path.join()
  |> Path.wildcard()
  # A directory whose name matches the glob would make File.read!/1 raise.
  |> Enum.filter(&File.regular?/1)
  |> Enum.flat_map(fn file ->
    case file |> Path.relative_to(repos_dir) |> String.split("/", parts: 2) do
      [project, path] ->
        [%{project: project, path: path, contents: File.read!(file)}]

      # A file sitting directly in `repos_dir` belongs to no project; skip it.
      [_top_level_file] ->
        []
    end
  end)
```
```elixir
defmodule FIM do
  @moduledoc """
  Fill-in-the-middle (FIM) permutation of token sequences.
  """

  @doc """
  Randomly rearranges the 1-D token tensor `sample` for fill-in-the-middle.

  With probability `:fim_rate` the sample is cut at two random points into a
  prefix, a middle and a suffix, and returned as

      [prefix_token, suffix_token] ++ suffix ++ [middle_token] ++ prefix ++ middle

  Otherwise the sample is returned unchanged, followed by three pad tokens.
  Both outcomes have `Nx.axis_size(sample, 0) + 3` elements, so permuted and
  untouched samples can be stacked into one batch.

  Samples with fewer than three tokens cannot be cut into three non-empty
  pieces and are always padded.

  ## Options

    * `:prefix_token_id`, `:middle_token_id`, `:suffix_token_id` - ids of the
      sentinel tokens inserted when the sample is permuted
    * `:pad_token_id` - id appended three times when the sample is not permuted
    * `:fim_rate` - probability of permuting a sample (default `0.5`)
    * `:fim_spm_rate` - accepted for compatibility, currently unused
      (default `0.5`)

  Raises `ArgumentError` (from `Keyword.validate!/2`) on unknown options.
  """
  def permute(sample, opts \\ []) do
    opts =
      Keyword.validate!(opts, [
        :prefix_token_id,
        :middle_token_id,
        :suffix_token_id,
        :pad_token_id,
        fim_rate: 0.5,
        fim_spm_rate: 0.5
      ])

    size = Nx.axis_size(sample, 0)

    # `:rand.uniform/0` is uniform on [0.0, 1.0), so `< fim_rate` permutes with
    # probability `fim_rate`.
    if size >= 3 and :rand.uniform() < opts[:fim_rate] do
      rearrange(sample, size, opts)
    else
      pad(sample, opts)
    end
  end

  # Cuts `sample` into prefix/middle/suffix and emits them in FIM order.
  defp rearrange(sample, size, opts) do
    # Two distinct cut points in 1..size-1 keep all three pieces non-empty.
    [low, high] = 1..(size - 1) |> Enum.take_random(2) |> Enum.sort()

    prefix = sample[0..(low - 1)//1]
    middle = sample[low..(high - 1)//1]
    suffix = sample[high..-1//1]

    Nx.concatenate([
      Nx.tensor([opts[:prefix_token_id], opts[:suffix_token_id]]),
      suffix,
      Nx.tensor([opts[:middle_token_id]]),
      prefix,
      middle
    ])
  end

  # Appends three pad tokens so the length matches a permuted sample.
  defp pad(sample, opts) do
    pad_token = Nx.tensor([opts[:pad_token_id]])
    Nx.concatenate([sample, pad_token, pad_token, pad_token])
  end
end
```
```elixir
# %HTTPoison.Response{body: body} = HTTPoison.get!(
# "https://mirrors.gitee.com/gitee-mirrors/other/git-lfs-linux-amd64-v3.2.0.tar.gz",
# [{"User-Agent", "wget"}],[follow_redirect: true, proxy: "", timeout: 50_000, recv_timeout: 50_000])
# File.write!("/home/user/git-lfs.tar.gz", body)
# System.set_env("PATH", "$PATH:/home/user/bin")
# System.get_env("PATH")
```
```elixir
# git lfs install
# git clone https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base
# System.cmd("tar", ["-zxvf", "/home/user/git-lfs.tar.gz", "-C", "/home/user/lfs"])
# :erl_tar.extract({:binary, body}, [:compressed, {:cwd, "/home/user"}])
# File.ls("/home/user")
# System.cmd("git", ["lfs", "install"], [env: [{"PATH", "${PATH}:/home/user/git-lfs-3.2.0"}]]) |> IO.inspect(label: "LFS")
# File.ls("/home/user/model") |> IO.inspect()
# System.cmd("rm", ["-rf", "/home/user/model"])
# System.cmd("git", ["lfs", "install"], [env: [{"PATH", "${PATH}:/home/user/git-lfs-3.2.0"}]]) |> IO.inspect(label: "LFS")
# File.ls("/home/user/model") |> IO.inspect()
# System.cmd("git", ["clone", "https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base", "/home/user/model"], [env: [{"HTTP_PROXY", "http://211.99.101.2:30443"}, {"HTTPS_PROXY", "http://211.99.101.7:30443"}, {"PATH", "${PATH}:/home/user/git-lfs-3.2.0"}]])
```
```elixir
# Print the listing of the model directory, then stat the PyTorch weights
# file; `File.stat!/1` raises if the file is missing.
IO.inspect(File.ls("/home/user/model"))
File.stat!("/home/user/model/pytorch_model.bin")
```
```elixir
# Load the tokenizer from the local model directory; the match raises unless
# loading returns `{:ok, tokenizer}`.
{:ok, tokenizer} =
Bumblebee.load_tokenizer({:local, "/home/user/model"})
```
```elixir
# Sentinel and padding tokens, written as literal vocabulary entries.
# NOTE(review): DeepSeek Coder's published special tokens appear to use the
# full-width bar "｜" (U+FF5C) rather than ASCII "|" — confirm these literals
# match the tokenizer's vocabulary.
fim_prefix = "<|fim▁begin|>"
fim_middle = "<|fim▁hole|>"
fim_suffix = "<|fim▁end|>"
pad_token = "<|end▁of▁sentence|>"

# Resolve a token to its id, failing immediately with a readable message when
# the token is not in the vocabulary instead of passing `nil` downstream.
fetch_token_id = fn token ->
  case Tokenizers.Tokenizer.token_to_id(tokenizer.native_tokenizer, token) do
    nil -> raise ArgumentError, "token #{inspect(token)} is not in the tokenizer vocabulary"
    id -> id
  end
end

prefix_token_id = fetch_token_id.(fim_prefix)
middle_token_id = fetch_token_id.(fim_middle)
suffix_token_id = fetch_token_id.(fim_suffix)
pad_token_id = fetch_token_id.(pad_token)
```
```elixir
# Build a lazy stream of at most 200 training batches. Each batch is a tuple
# `{%{"input_ids" => batch}, batch}` where `batch` stacks `batch_size`
# FIM-permuted samples.
max_seq_len = 128
batch_size = 8

train_data =
  data
  |> Stream.flat_map(fn %{contents: contents} ->
    tokenized = Bumblebee.apply_tokenizer(tokenizer, contents)

    case tokenized["input_ids"] do
      # this will discard some examples, but that's okay
      %{shape: {1, seq}} when seq < max_seq_len ->
        []

      tensor ->
        tensor
        |> Nx.transpose()
        # Chunks are three tokens short of `max_seq_len` because
        # FIM.permute/2 adds three tokens to every sample.
        |> Nx.to_batched(max_seq_len - 3, leftover: :discard)
        |> Enum.map(&Nx.squeeze/1)
    end
  end)
  |> Stream.map(
    &FIM.permute(&1,
      prefix_token_id: prefix_token_id,
      middle_token_id: middle_token_id,
      suffix_token_id: suffix_token_id,
      pad_token_id: pad_token_id
    )
  )
  # Stream.chunk_every/4 takes `:discard` positionally. A keyword list here
  # would be used as padding for the last short chunk, putting a
  # `{:leftover, :discard}` tuple into the batch.
  |> Stream.chunk_every(batch_size, batch_size, :discard)
  |> Stream.map(fn input_ids ->
    batch = Nx.stack(input_ids)
    {%{"input_ids" => batch}, batch}
  end)
  |> Stream.take(200)
```
```elixir
# Print the platforms EXLA supports, then load the model from the local
# model directory; the match raises unless loading returns `{:ok, model_info}`.
platforms = IO.inspect(EXLA.Client.get_supported_platforms())
repo = {:local, "/home/user/model"}
{:ok, model_info} = Bumblebee.load_model(repo)
```
```elixir
defmodule Trainer do
  @moduledoc """
  Numerical definitions used when training the model.
  """

  import Nx.Defn

  @doc """
  Mean next-token cross-entropy between `labels` and `logits`.

  `labels` is indexed along two axes and `logits` along three. The first label
  and the last logit along the second axis are dropped, so the logit at
  position `t` is scored against the label at position `t + 1`.

  Positions whose label equals `opts[:pad_token_id]` contribute a loss of
  `0.0`; the mean is still taken over every position.

  ## Options

    * `:pad_token_id` - label value treated as padding
  """
  defn causal_loss(labels, logits, opts \\ []) do
    opts = keyword!(opts, [:pad_token_id])

    # Align each prediction with the token that follows it.
    targets = labels[[.., 1..-1//1]]
    predictions = logits[[.., 0..-2//1, ..]]

    per_token_loss =
      Axon.Losses.categorical_cross_entropy(Nx.new_axis(targets, -1), predictions,
        from_logits: true,
        sparse: true
      )

    # Zero the loss wherever the target is padding, then average.
    targets
    |> Nx.equal(opts[:pad_token_id])
    |> Nx.select(0.0, per_token_loss)
    |> Nx.mean()
  end
end
```
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。